1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM x86_fpu #if !defined(_TRACE_FPU_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FPU_H #include <linux/tracepoint.h> DECLARE_EVENT_CLASS(x86_fpu, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu), TP_STRUCT__entry( __field(struct fpu *, fpu) __field(bool, load_fpu) __field(u64, xfeatures) __field(u64, xcomp_bv) ), TP_fast_assign( __entry->fpu = fpu; __entry->load_fpu = test_thread_flag(TIF_NEED_FPU_LOAD); if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->state.xsave.header.xfeatures; __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; } ), TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, __entry->load_fpu, __entry->xfeatures, __entry->xcomp_bv ) ); DEFINE_EVENT(x86_fpu, x86_fpu_before_save, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_after_save, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_before_restore, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_after_restore, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_regs_activated, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_init_state, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_dropped, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_copy_src, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_copy_dst, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_xstate_check_failed, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH asm/trace/ #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE fpu #endif /* _TRACE_FPU_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/file_table.c * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) */ #include <linux/string.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/init.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/eventpoll.h> #include <linux/rcupdate.h> #include <linux/mount.h> #include <linux/capability.h> #include <linux/cdev.h> #include <linux/fsnotify.h> #include <linux/sysctl.h> #include <linux/percpu_counter.h> #include <linux/percpu.h> #include <linux/task_work.h> #include <linux/ima.h> #include <linux/swap.h> #include <linux/atomic.h> #include "internal.h" /* sysctl tunables... */ struct files_stat_struct files_stat = { .max_files = NR_FILE }; /* SLAB cache for file structures */ static struct kmem_cache *filp_cachep __read_mostly; static struct percpu_counter nr_files __cacheline_aligned_in_smp; static void file_free_rcu(struct rcu_head *head) { struct file *f = container_of(head, struct file, f_u.fu_rcuhead); put_cred(f->f_cred); kmem_cache_free(filp_cachep, f); } static inline void file_free(struct file *f) { security_file_free(f); if (!(f->f_mode & FMODE_NOACCOUNT)) percpu_counter_dec(&nr_files); call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } /* * Return the total number of open files in the system */ static long get_nr_files(void) { return percpu_counter_read_positive(&nr_files); } /* * Return the maximum number of open files in the system */ unsigned long get_max_files(void) { return files_stat.max_files; } EXPORT_SYMBOL_GPL(get_max_files); /* * Handle nr_files sysctl */ #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) int proc_nr_files(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #else int proc_nr_files(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } #endif static struct file *__alloc_file(int flags, const struct cred *cred) { struct file *f; int error; f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); if (unlikely(!f)) return ERR_PTR(-ENOMEM); f->f_cred = get_cred(cred); error = security_file_alloc(f); if (unlikely(error)) { file_free_rcu(&f->f_u.fu_rcuhead); return ERR_PTR(error); } atomic_long_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); spin_lock_init(&f->f_lock); mutex_init(&f->f_pos_lock); eventpoll_init_file(f); f->f_flags = flags; f->f_mode = OPEN_FMODE(flags); /* f->f_version: 0 */ return f; } /* Find an unused file structure and return a pointer to it. * Returns an error pointer if some error happend e.g. we over file * structures limit, run out of memory or operation is not permitted. * * Be very careful using this. You are responsible for * getting write access to any mount that you might assign * to this filp, if it is opened for write. If this is not * done, you will imbalance int the mount's writer count * and a warning at __fput() time. */ struct file *alloc_empty_file(int flags, const struct cred *cred) { static long old_max; struct file *f; /* * Privileged users can go above max_files */ if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { /* * percpu_counters are inaccurate. Do an expensive check before * we go and fail. */ if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files) goto over; } f = __alloc_file(flags, cred); if (!IS_ERR(f)) percpu_counter_inc(&nr_files); return f; over: /* Ran out of filps - report that */ if (get_nr_files() > old_max) { pr_info("VFS: file-max limit %lu reached\n", get_max_files()); old_max = get_nr_files(); } return ERR_PTR(-ENFILE); } /* * Variant of alloc_empty_file() that doesn't check and modify nr_files. * * Should not be used unless there's a very good reason to do so. */ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred) { struct file *f = __alloc_file(flags, cred); if (!IS_ERR(f)) f->f_mode |= FMODE_NOACCOUNT; return f; } /** * alloc_file - allocate and initialize a 'struct file' * * @path: the (dentry, vfsmount) pair for the new file * @flags: O_... flags with which the new file will be opened * @fop: the 'struct file_operations' for the new file */ static struct file *alloc_file(const struct path *path, int flags, const struct file_operations *fop) { struct file *file; file = alloc_empty_file(flags, current_cred()); if (IS_ERR(file)) return file; file->f_path = *path; file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); file->f_sb_err = file_sample_sb_err(file); if ((file->f_mode & FMODE_READ) && likely(fop->read || fop->read_iter)) file->f_mode |= FMODE_CAN_READ; if ((file->f_mode & FMODE_WRITE) && likely(fop->write || fop->write_iter)) file->f_mode |= FMODE_CAN_WRITE; file->f_mode |= FMODE_OPENED; file->f_op = fop; if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(path->dentry->d_inode); return file; } struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, const char *name, int flags, const struct file_operations *fops) { static const struct dentry_operations anon_ops = { .d_dname = simple_dname }; struct qstr this = QSTR_INIT(name, strlen(name)); struct path path; struct file *file; path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this); if (!path.dentry) return ERR_PTR(-ENOMEM); if (!mnt->mnt_sb->s_d_op) d_set_d_op(path.dentry, &anon_ops); path.mnt = mntget(mnt); d_instantiate(path.dentry, inode); file = alloc_file(&path, flags, fops); if (IS_ERR(file)) { ihold(inode); path_put(&path); } return file; } EXPORT_SYMBOL(alloc_file_pseudo); struct file *alloc_file_clone(struct file *base, int flags, const struct file_operations *fops) { struct file *f = alloc_file(&base->f_path, flags, fops); if (!IS_ERR(f)) { path_get(&f->f_path); f->f_mapping = base->f_mapping; } return f; } /* the real guts of fput() - releasing the last reference to file */ static void __fput(struct file *file) { struct dentry *dentry = file->f_path.dentry; struct vfsmount *mnt = file->f_path.mnt; struct inode *inode = file->f_inode; fmode_t mode = file->f_mode; if (unlikely(!(file->f_mode & FMODE_OPENED))) goto out; might_sleep(); fsnotify_close(file); /* * The function eventpoll_release() should be the first called * in the file cleanup chain. */ eventpoll_release(file); locks_remove_file(file); ima_file_free(file); if (unlikely(file->f_flags & FASYNC)) { if (file->f_op->fasync) file->f_op->fasync(-1, file, 0); } if (file->f_op->release) file->f_op->release(inode, file); if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && !(mode & FMODE_PATH))) { cdev_put(inode->i_cdev); } fops_put(file->f_op); put_pid(file->f_owner.pid); if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_dec(inode); if (mode & FMODE_WRITER) { put_write_access(inode); __mnt_drop_write(mnt); } dput(dentry); if (unlikely(mode & FMODE_NEED_UNMOUNT)) dissolve_on_fput(mnt); mntput(mnt); out: file_free(file); } static LLIST_HEAD(delayed_fput_list); static void delayed_fput(struct work_struct *unused) { struct llist_node *node = llist_del_all(&delayed_fput_list); struct file *f, *t; llist_for_each_entry_safe(f, t, node, f_u.fu_llist) __fput(f); } static void ____fput(struct callback_head *work) { __fput(container_of(work, struct file, f_u.fu_rcuhead)); } /* * If kernel thread really needs to have the final fput() it has done * to complete, call this. The only user right now is the boot - we * *do* need to make sure our writes to binaries on initramfs has * not left us with opened struct file waiting for __fput() - execve() * won't work without that. Please, don't add more callers without * very good reasons; in particular, never call that with locks * held and never call that from a thread that might need to do * some work on any kind of umount. */ void flush_delayed_fput(void) { delayed_fput(NULL); } EXPORT_SYMBOL_GPL(flush_delayed_fput); static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); void fput_many(struct file *file, unsigned int refs) { if (atomic_long_sub_and_test(refs, &file->f_count)) { struct task_struct *task = current; if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { init_task_work(&file->f_u.fu_rcuhead, ____fput); if (!task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME)) return; /* * After this task has run exit_task_work(), * task_work_add() will fail. Fall through to delayed * fput to avoid leaking *file. */ } if (llist_add(&file->f_u.fu_llist, &delayed_fput_list)) schedule_delayed_work(&delayed_fput_work, 1); } } void fput(struct file *file) { fput_many(file, 1); } /* * synchronous analog of fput(); for kernel threads that might be needed * in some umount() (and thus can't use flush_delayed_fput() without * risking deadlocks), need to wait for completion of __fput() and know * for this specific struct file it won't involve anything that would * need them. Use only if you really need it - at the very least, * don't blindly convert fput() by kernel thread to that. */ void __fput_sync(struct file *file) { if (atomic_long_dec_and_test(&file->f_count)) { struct task_struct *task = current; BUG_ON(!(task->flags & PF_KTHREAD)); __fput(file); } } EXPORT_SYMBOL(fput); EXPORT_SYMBOL(__fput_sync); void __init files_init(void) { filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } /* * One file with associated inode and dcache is very roughly 1K. Per default * do not use more than 10% of our memory for files. */ void __init files_maxfiles_init(void) { unsigned long n; unsigned long nr_pages = totalram_pages(); unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2; memreserve = min(memreserve, nr_pages - 1); n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); }
8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Authentication token and access key management * * Copyright (C) 2004, 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * See Documentation/security/keys/core.rst for information on keys/keyrings. */ #ifndef _LINUX_KEY_H #define _LINUX_KEY_H #include <linux/types.h> #include <linux/list.h> #include <linux/rbtree.h> #include <linux/rcupdate.h> #include <linux/sysctl.h> #include <linux/rwsem.h> #include <linux/atomic.h> #include <linux/assoc_array.h> #include <linux/refcount.h> #include <linux/time64.h> #ifdef __KERNEL__ #include <linux/uidgid.h> /* key handle serial number */ typedef int32_t key_serial_t; /* key handle permissions mask */ typedef uint32_t key_perm_t; struct key; struct net; #ifdef CONFIG_KEYS #undef KEY_DEBUGGING #define KEY_POS_VIEW 0x01000000 /* possessor can view a key's attributes */ #define KEY_POS_READ 0x02000000 /* possessor can read key payload / view keyring */ #define KEY_POS_WRITE 0x04000000 /* possessor can update key payload / add link to keyring */ #define KEY_POS_SEARCH 0x08000000 /* possessor can find a key in search / search a keyring */ #define KEY_POS_LINK 0x10000000 /* possessor can create a link to a key/keyring */ #define KEY_POS_SETATTR 0x20000000 /* possessor can set key attributes */ #define KEY_POS_ALL 0x3f000000 #define KEY_USR_VIEW 0x00010000 /* user permissions... */ #define KEY_USR_READ 0x00020000 #define KEY_USR_WRITE 0x00040000 #define KEY_USR_SEARCH 0x00080000 #define KEY_USR_LINK 0x00100000 #define KEY_USR_SETATTR 0x00200000 #define KEY_USR_ALL 0x003f0000 #define KEY_GRP_VIEW 0x00000100 /* group permissions... */ #define KEY_GRP_READ 0x00000200 #define KEY_GRP_WRITE 0x00000400 #define KEY_GRP_SEARCH 0x00000800 #define KEY_GRP_LINK 0x00001000 #define KEY_GRP_SETATTR 0x00002000 #define KEY_GRP_ALL 0x00003f00 #define KEY_OTH_VIEW 0x00000001 /* third party permissions... */ #define KEY_OTH_READ 0x00000002 #define KEY_OTH_WRITE 0x00000004 #define KEY_OTH_SEARCH 0x00000008 #define KEY_OTH_LINK 0x00000010 #define KEY_OTH_SETATTR 0x00000020 #define KEY_OTH_ALL 0x0000003f #define KEY_PERM_UNDEF 0xffffffff /* * The permissions required on a key that we're looking up. */ enum key_need_perm { KEY_NEED_UNSPECIFIED, /* Needed permission unspecified */ KEY_NEED_VIEW, /* Require permission to view attributes */ KEY_NEED_READ, /* Require permission to read content */ KEY_NEED_WRITE, /* Require permission to update / modify */ KEY_NEED_SEARCH, /* Require permission to search (keyring) or find (key) */ KEY_NEED_LINK, /* Require permission to link */ KEY_NEED_SETATTR, /* Require permission to change attributes */ KEY_NEED_UNLINK, /* Require permission to unlink key */ KEY_SYSADMIN_OVERRIDE, /* Special: override by CAP_SYS_ADMIN */ KEY_AUTHTOKEN_OVERRIDE, /* Special: override by possession of auth token */ KEY_DEFER_PERM_CHECK, /* Special: permission check is deferred */ }; struct seq_file; struct user_struct; struct signal_struct; struct cred; struct key_type; struct key_owner; struct key_tag; struct keyring_list; struct keyring_name; struct key_tag { struct rcu_head rcu; refcount_t usage; bool removed; /* T when subject removed */ }; struct keyring_index_key { /* [!] If this structure is altered, the union in struct key must change too! */ unsigned long hash; /* Hash value */ union { struct { #ifdef __LITTLE_ENDIAN /* Put desc_len at the LSB of x */ u16 desc_len; char desc[sizeof(long) - 2]; /* First few chars of description */ #else char desc[sizeof(long) - 2]; /* First few chars of description */ u16 desc_len; #endif }; unsigned long x; }; struct key_type *type; struct key_tag *domain_tag; /* Domain of operation */ const char *description; }; union key_payload { void __rcu *rcu_data0; void *data[4]; }; /*****************************************************************************/ /* * key reference with possession attribute handling * * NOTE! key_ref_t is a typedef'd pointer to a type that is not actually * defined. This is because we abuse the bottom bit of the reference to carry a * flag to indicate whether the calling process possesses that key in one of * its keyrings. * * the key_ref_t has been made a separate type so that the compiler can reject * attempts to dereference it without proper conversion. * * the three functions are used to assemble and disassemble references */ typedef struct __key_reference_with_attributes *key_ref_t; static inline key_ref_t make_key_ref(const struct key *key, bool possession) { return (key_ref_t) ((unsigned long) key | possession); } static inline struct key *key_ref_to_ptr(const key_ref_t key_ref) { return (struct key *) ((unsigned long) key_ref & ~1UL); } static inline bool is_key_possessed(const key_ref_t key_ref) { return (unsigned long) key_ref & 1UL; } typedef int (*key_restrict_link_func_t)(struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key); struct key_restriction { key_restrict_link_func_t check; struct key *key; struct key_type *keytype; }; enum key_state { KEY_IS_UNINSTANTIATED, KEY_IS_POSITIVE, /* Positively instantiated */ }; /*****************************************************************************/ /* * authentication token / access credential / keyring * - types of key include: * - keyrings * - disk encryption IDs * - Kerberos TGTs and tickets */ struct key { refcount_t usage; /* number of references */ key_serial_t serial; /* key serial number */ union { struct list_head graveyard_link; struct rb_node serial_node; }; #ifdef CONFIG_KEY_NOTIFICATIONS struct watch_list *watchers; /* Entities watching this key for changes */ #endif struct rw_semaphore sem; /* change vs change sem */ struct key_user *user; /* owner of this key */ void *security; /* security data for this key */ union { time64_t expiry; /* time at which key expires (or 0) */ time64_t revoked_at; /* time at which key was revoked */ }; time64_t last_used_at; /* last time used for LRU keyring discard */ kuid_t uid; kgid_t gid; key_perm_t perm; /* access permissions */ unsigned short quotalen; /* length added to quota */ unsigned short datalen; /* payload data length * - may not match RCU dereferenced payload * - payload should contain own length */ short state; /* Key state (+) or rejection error (-) */ #ifdef KEY_DEBUGGING unsigned magic; #define KEY_DEBUG_MAGIC 0x18273645u #endif unsigned long flags; /* status flags (change with bitops) */ #define KEY_FLAG_DEAD 0 /* set if key type has been deleted */ #define KEY_FLAG_REVOKED 1 /* set if key had been revoked */ #define KEY_FLAG_IN_QUOTA 2 /* set if key consumes quota */ #define KEY_FLAG_USER_CONSTRUCT 3 /* set if key is being constructed in userspace */ #define KEY_FLAG_ROOT_CAN_CLEAR 4 /* set if key can be cleared by root without permission */ #define KEY_FLAG_INVALIDATED 5 /* set if key has been invalidated */ #define KEY_FLAG_BUILTIN 6 /* set if key is built in to the kernel */ #define KEY_FLAG_ROOT_CAN_INVAL 7 /* set if key can be invalidated by root without permission */ #define KEY_FLAG_KEEP 8 /* set if key should not be removed */ #define KEY_FLAG_UID_KEYRING 9 /* set if key is a user or user session keyring */ /* the key type and key description string * - the desc is used to match a key against search criteria * - it should be a printable string * - eg: for krb5 AFS, this might be "afs@REDHAT.COM" */ union { struct keyring_index_key index_key; struct { unsigned long hash; unsigned long len_desc; struct key_type *type; /* type of key */ struct key_tag *domain_tag; /* Domain of operation */ char *description; }; }; /* key data * - this is used to hold the data actually used in cryptography or * whatever */ union { union key_payload payload; struct { /* Keyring bits */ struct list_head name_link; struct assoc_array keys; }; }; /* This is set on a keyring to restrict the addition of a link to a key * to it. If this structure isn't provided then it is assumed that the * keyring is open to any addition. It is ignored for non-keyring * keys. Only set this value using keyring_restrict(), keyring_alloc(), * or key_alloc(). * * This is intended for use with rings of trusted keys whereby addition * to the keyring needs to be controlled. KEY_ALLOC_BYPASS_RESTRICTION * overrides this, allowing the kernel to add extra keys without * restriction. */ struct key_restriction *restrict_link; }; extern struct key *key_alloc(struct key_type *type, const char *desc, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link); #define KEY_ALLOC_IN_QUOTA 0x0000 /* add to quota, reject if would overrun */ #define KEY_ALLOC_QUOTA_OVERRUN 0x0001 /* add to quota, permit even if overrun */ #define KEY_ALLOC_NOT_IN_QUOTA 0x0002 /* not in quota */ #define KEY_ALLOC_BUILT_IN 0x0004 /* Key is built into kernel */ #define KEY_ALLOC_BYPASS_RESTRICTION 0x0008 /* Override the check on restricted keyrings */ #define KEY_ALLOC_UID_KEYRING 0x0010 /* allocating a user or user session keyring */ #define KEY_ALLOC_SET_KEEP 0x0020 /* Set the KEEP flag on the key/keyring */ extern void key_revoke(struct key *key); extern void key_invalidate(struct key *key); extern void key_put(struct key *key); extern bool key_put_tag(struct key_tag *tag); extern void key_remove_domain(struct key_tag *domain_tag); static inline struct key *__key_get(struct key *key) { refcount_inc(&key->usage); return key; } static inline struct key *key_get(struct key *key) { return key ? __key_get(key) : key; } static inline void key_ref_put(key_ref_t key_ref) { key_put(key_ref_to_ptr(key_ref)); } extern struct key *request_key_tag(struct key_type *type, const char *description, struct key_tag *domain_tag, const char *callout_info); extern struct key *request_key_rcu(struct key_type *type, const char *description, struct key_tag *domain_tag); extern struct key *request_key_with_auxdata(struct key_type *type, const char *description, struct key_tag *domain_tag, const void *callout_info, size_t callout_len, void *aux); /** * request_key - Request a key and wait for construction * @type: Type of key. * @description: The searchable description of the key. * @callout_info: The data to pass to the instantiation upcall (or NULL). * * As for request_key_tag(), but with the default global domain tag. */ static inline struct key *request_key(struct key_type *type, const char *description, const char *callout_info) { return request_key_tag(type, description, NULL, callout_info); } #ifdef CONFIG_NET /** * request_key_net - Request a key for a net namespace and wait for construction * @type: Type of key. * @description: The searchable description of the key. * @net: The network namespace that is the key's domain of operation. * @callout_info: The data to pass to the instantiation upcall (or NULL). * * As for request_key() except that it does not add the returned key to a * keyring if found, new keys are always allocated in the user's quota, the * callout_info must be a NUL-terminated string and no auxiliary data can be * passed. Only keys that operate the specified network namespace are used. * * Furthermore, it then works as wait_for_key_construction() to wait for the * completion of keys undergoing construction with a non-interruptible wait. */ #define request_key_net(type, description, net, callout_info) \ request_key_tag(type, description, net->key_domain, callout_info); /** * request_key_net_rcu - Request a key for a net namespace under RCU conditions * @type: Type of key. * @description: The searchable description of the key. * @net: The network namespace that is the key's domain of operation. * * As for request_key_rcu() except that only keys that operate the specified * network namespace are used. */ #define request_key_net_rcu(type, description, net) \ request_key_rcu(type, description, net->key_domain); #endif /* CONFIG_NET */ extern int wait_for_key_construction(struct key *key, bool intr); extern int key_validate(const struct key *key); extern key_ref_t key_create_or_update(key_ref_t keyring, const char *type, const char *description, const void *payload, size_t plen, key_perm_t perm, unsigned long flags); extern int key_update(key_ref_t key, const void *payload, size_t plen); extern int key_link(struct key *keyring, struct key *key); extern int key_move(struct key *key, struct key *from_keyring, struct key *to_keyring, unsigned int flags); extern int key_unlink(struct key *keyring, struct key *key); extern struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link, struct key *dest); extern int restrict_link_reject(struct key *keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key); extern int keyring_clear(struct key *keyring); extern key_ref_t keyring_search(key_ref_t keyring, struct key_type *type, const char *description, bool recurse); extern int keyring_add_key(struct key *keyring, struct key *key); extern int keyring_restrict(key_ref_t keyring, const char *type, const char *restriction); extern struct key *key_lookup(key_serial_t id); static inline key_serial_t key_serial(const struct key *key) { return key ? key->serial : 0; } extern void key_set_timeout(struct key *, unsigned); extern key_ref_t lookup_user_key(key_serial_t id, unsigned long flags, enum key_need_perm need_perm); extern void key_free_user_ns(struct user_namespace *); static inline short key_read_state(const struct key *key) { /* Barrier versus mark_key_instantiated(). */ return smp_load_acquire(&key->state); } /** * key_is_positive - Determine if a key has been positively instantiated * @key: The key to check. * * Return true if the specified key has been positively instantiated, false * otherwise. */ static inline bool key_is_positive(const struct key *key) { return key_read_state(key) == KEY_IS_POSITIVE; } static inline bool key_is_negative(const struct key *key) { return key_read_state(key) < 0; } #define dereference_key_rcu(KEY) \ (rcu_dereference((KEY)->payload.rcu_data0)) #define dereference_key_locked(KEY) \ (rcu_dereference_protected((KEY)->payload.rcu_data0, \ rwsem_is_locked(&((struct key *)(KEY))->sem))) #define rcu_assign_keypointer(KEY, PAYLOAD) \ do { \ rcu_assign_pointer((KEY)->payload.rcu_data0, (PAYLOAD)); \ } while (0) #ifdef CONFIG_SYSCTL extern struct ctl_table key_sysctls[]; #endif /* * the userspace interface */ extern int install_thread_keyring_to_cred(struct cred *cred); extern void key_fsuid_changed(struct cred *new_cred); extern void key_fsgid_changed(struct cred *new_cred); extern void key_init(void); #else /* CONFIG_KEYS */ #define key_validate(k) 0 #define key_serial(k) 0 #define key_get(k) ({ NULL; }) #define key_revoke(k) do { } while(0) #define key_invalidate(k) do { } while(0) #define key_put(k) do { } while(0) #define key_ref_put(k) do { } while(0) #define make_key_ref(k, p) NULL #define key_ref_to_ptr(k) NULL #define is_key_possessed(k) 0 #define key_fsuid_changed(c) do { } while(0) #define key_fsgid_changed(c) do { } while(0) #define key_init() do { } while(0) #define key_free_user_ns(ns) do { } while(0) #define key_remove_domain(d) do { } while(0) #endif /* CONFIG_KEYS */ #endif /* __KERNEL__ */ #endif /* _LINUX_KEY_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_ATOMIC_H #define _ASM_X86_ATOMIC_H #include <linux/compiler.h> #include <linux/types.h> #include <asm/alternative.h> #include <asm/cmpxchg.h> #include <asm/rmwcc.h> #include <asm/barrier.h> /* * Atomic operations that C can't guarantee us. Useful for * resource counting etc.. */ /** * arch_atomic_read - read atomic variable * @v: pointer of type atomic_t * * Atomically reads the value of @v. */ static __always_inline int arch_atomic_read(const atomic_t *v) { /* * Note for KASAN: we deliberately don't use READ_ONCE_NOCHECK() here, * it's non-inlined function that increases binary size and stack usage. */ return __READ_ONCE((v)->counter); } /** * arch_atomic_set - set atomic variable * @v: pointer of type atomic_t * @i: required value * * Atomically sets the value of @v to @i. */ static __always_inline void arch_atomic_set(atomic_t *v, int i) { __WRITE_ONCE(v->counter, i); } /** * arch_atomic_add - add integer to atomic variable * @i: integer value to add * @v: pointer of type atomic_t * * Atomically adds @i to @v. */ static __always_inline void arch_atomic_add(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "addl %1,%0" : "+m" (v->counter) : "ir" (i) : "memory"); } /** * arch_atomic_sub - subtract integer from atomic variable * @i: integer value to subtract * @v: pointer of type atomic_t * * Atomically subtracts @i from @v. */ static __always_inline void arch_atomic_sub(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "subl %1,%0" : "+m" (v->counter) : "ir" (i) : "memory"); } /** * arch_atomic_sub_and_test - subtract value from variable and test result * @i: integer value to subtract * @v: pointer of type atomic_t * * Atomically subtracts @i from @v and returns * true if the result is zero, or false for all * other cases. */ static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, e, "er", i); } #define arch_atomic_sub_and_test arch_atomic_sub_and_test /** * arch_atomic_inc - increment atomic variable * @v: pointer of type atomic_t * * Atomically increments @v by 1. */ static __always_inline void arch_atomic_inc(atomic_t *v) { asm volatile(LOCK_PREFIX "incl %0" : "+m" (v->counter) :: "memory"); } #define arch_atomic_inc arch_atomic_inc /** * arch_atomic_dec - decrement atomic variable * @v: pointer of type atomic_t * * Atomically decrements @v by 1. */ static __always_inline void arch_atomic_dec(atomic_t *v) { asm volatile(LOCK_PREFIX "decl %0" : "+m" (v->counter) :: "memory"); } #define arch_atomic_dec arch_atomic_dec /** * arch_atomic_dec_and_test - decrement and test * @v: pointer of type atomic_t * * Atomically decrements @v by 1 and * returns true if the result is 0, or false for all other * cases. */ static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, e); } #define arch_atomic_dec_and_test arch_atomic_dec_and_test /** * arch_atomic_inc_and_test - increment and test * @v: pointer of type atomic_t * * Atomically increments @v by 1 * and returns true if the result is zero, or false for all * other cases. */ static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, e); } #define arch_atomic_inc_and_test arch_atomic_inc_and_test /** * arch_atomic_add_negative - add and test if negative * @i: integer value to add * @v: pointer of type atomic_t * * Atomically adds @i to @v and returns true * if the result is negative, or false when * result is greater than or equal to zero. */ static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, s, "er", i); } #define arch_atomic_add_negative arch_atomic_add_negative /** * arch_atomic_add_return - add integer and return * @i: integer value to add * @v: pointer of type atomic_t * * Atomically adds @i to @v and returns @i + @v */ static __always_inline int arch_atomic_add_return(int i, atomic_t *v) { return i + xadd(&v->counter, i); } #define arch_atomic_add_return arch_atomic_add_return /** * arch_atomic_sub_return - subtract integer and return * @v: pointer of type atomic_t * @i: integer value to subtract * * Atomically subtracts @i from @v and returns @v - @i */ static __always_inline int arch_atomic_sub_return(int i, atomic_t *v) { return arch_atomic_add_return(-i, v); } #define arch_atomic_sub_return arch_atomic_sub_return static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v) { return xadd(&v->counter, i); } #define arch_atomic_fetch_add arch_atomic_fetch_add static __always_inline int arch_atomic_fetch_sub(int i, atomic_t *v) { return xadd(&v->counter, -i); } #define arch_atomic_fetch_sub arch_atomic_fetch_sub static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) { return arch_cmpxchg(&v->counter, old, new); } #define arch_atomic_cmpxchg arch_atomic_cmpxchg static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new) { return try_cmpxchg(&v->counter, old, new); } #define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg static __always_inline int arch_atomic_xchg(atomic_t *v, int new) { return arch_xchg(&v->counter, new); } #define arch_atomic_xchg arch_atomic_xchg static __always_inline void arch_atomic_and(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "andl %1,%0" : "+m" (v->counter) : "ir" (i) : "memory"); } static __always_inline int arch_atomic_fetch_and(int i, atomic_t *v) { int val = arch_atomic_read(v); do { } while (!arch_atomic_try_cmpxchg(v, &val, val & i)); return val; } #define arch_atomic_fetch_and arch_atomic_fetch_and static __always_inline void arch_atomic_or(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "orl %1,%0" : "+m" (v->counter) : "ir" (i) : "memory"); } static __always_inline int arch_atomic_fetch_or(int i, atomic_t *v) { int val = arch_atomic_read(v); do { } while (!arch_atomic_try_cmpxchg(v, &val, val | i)); return val; } #define arch_atomic_fetch_or arch_atomic_fetch_or static __always_inline void arch_atomic_xor(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "xorl %1,%0" : "+m" (v->counter) : "ir" (i) : "memory"); } static __always_inline int arch_atomic_fetch_xor(int i, atomic_t *v) { int val = arch_atomic_read(v); do { } while (!arch_atomic_try_cmpxchg(v, &val, val ^ i)); return val; } #define arch_atomic_fetch_xor arch_atomic_fetch_xor #ifdef CONFIG_X86_32 # include <asm/atomic64_32.h> #else # include <asm/atomic64_64.h> #endif #define ARCH_ATOMIC #endif /* _ASM_X86_ATOMIC_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_ERR_H #define _LINUX_ERR_H #include <linux/compiler.h> #include <linux/types.h> #include <asm/errno.h> /* * Kernel pointers have redundant information, so we can use a * scheme where we can return either an error code or a normal * pointer with the same return value. * * This should be a per-architecture thing, to allow different * error and pointer decisions. */ #define MAX_ERRNO 4095 #ifndef __ASSEMBLY__ #define IS_ERR_VALUE(x) unlikely((unsigned long)(void *)(x) >= (unsigned long)-MAX_ERRNO) static inline void * __must_check ERR_PTR(long error) { return (void *) error; } static inline long __must_check PTR_ERR(__force const void *ptr) { return (long) ptr; } static inline bool __must_check IS_ERR(__force const void *ptr) { return IS_ERR_VALUE((unsigned long)ptr); } static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr) { return unlikely(!ptr) || IS_ERR_VALUE((unsigned long)ptr); } /** * ERR_CAST - Explicitly cast an error-valued pointer to another pointer type * @ptr: The pointer to cast. * * Explicitly cast an error-valued pointer to another pointer type in such a * way as to make it clear that's what's going on. */ static inline void * __must_check ERR_CAST(__force const void *ptr) { /* cast away the const */ return (void *) ptr; } static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr) { if (IS_ERR(ptr)) return PTR_ERR(ptr); else return 0; } #endif #endif /* _LINUX_ERR_H */
1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 // SPDX-License-Identifier: GPL-2.0-only /* * The "user cache". * * (C) Copyright 1991-2000 Linus Torvalds * * We have a per-user structure to keep track of how many * processes, files etc the user has claimed, in order to be * able to have per-user limits for system resources. */ #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/bitops.h> #include <linux/key.h> #include <linux/sched/user.h> #include <linux/interrupt.h> #include <linux/export.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> /* * userns count is 1 for root user, 1 for init_uts_ns, * and 1 for... ? */ struct user_namespace init_user_ns = { .uid_map = { .nr_extents = 1, { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, }, }, .gid_map = { .nr_extents = 1, { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, }, }, .projid_map = { .nr_extents = 1, { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, }, }, .count = ATOMIC_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .ns.inum = PROC_USER_INIT_INO, #ifdef CONFIG_USER_NS .ns.ops = &userns_operations, #endif .flags = USERNS_INIT_FLAGS, #ifdef CONFIG_KEYS .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list), .keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem), #endif }; EXPORT_SYMBOL_GPL(init_user_ns); /* * UID task count cache, to get fast user lookup in "alloc_uid" * when changing user ID's (ie setuid() and friends). */ #define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 7) #define UIDHASH_SZ (1 << UIDHASH_BITS) #define UIDHASH_MASK (UIDHASH_SZ - 1) #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) #define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid)))) static struct kmem_cache *uid_cachep; static struct hlist_head uidhash_table[UIDHASH_SZ]; /* * The uidhash_lock is mostly taken from process context, but it is * occasionally also taken from softirq/tasklet context, when * task-structs get RCU-freed. Hence all locking must be softirq-safe. * But free_uid() is also called with local interrupts disabled, and running * local_bh_enable() with local interrupts disabled is an error - we'll run * softirq callbacks, and they can unconditionally enable interrupts, and * the caller of free_uid() didn't expect that.. */ static DEFINE_SPINLOCK(uidhash_lock); /* root_user.__count is 1, for init task cred */ struct user_struct root_user = { .__count = REFCOUNT_INIT(1), .processes = ATOMIC_INIT(1), .sigpending = ATOMIC_INIT(0), .locked_shm = 0, .uid = GLOBAL_ROOT_UID, .ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0), }; /* * These routines must be called with the uidhash spinlock held! */ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) { hlist_add_head(&up->uidhash_node, hashent); } static void uid_hash_remove(struct user_struct *up) { hlist_del_init(&up->uidhash_node); } static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) { struct user_struct *user; hlist_for_each_entry(user, hashent, uidhash_node) { if (uid_eq(user->uid, uid)) { refcount_inc(&user->__count); return user; } } return NULL; } /* IRQs are disabled and uidhash_lock is held upon function entry. * IRQ state (as stored in flags) is restored and uidhash_lock released * upon function exit. */ static void free_user(struct user_struct *up, unsigned long flags) __releases(&uidhash_lock) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); kmem_cache_free(uid_cachep, up); } /* * Locate the user_struct for the passed UID. If found, take a ref on it. The * caller must undo that ref with free_uid(). * * If the user_struct could not be found, return NULL. */ struct user_struct *find_user(kuid_t uid) { struct user_struct *ret; unsigned long flags; spin_lock_irqsave(&uidhash_lock, flags); ret = uid_hash_find(uid, uidhashentry(uid)); spin_unlock_irqrestore(&uidhash_lock, flags); return ret; } void free_uid(struct user_struct *up) { unsigned long flags; if (!up) return; if (refcount_dec_and_lock_irqsave(&up->__count, &uidhash_lock, &flags)) free_user(up, flags); } struct user_struct *alloc_uid(kuid_t uid) { struct hlist_head *hashent = uidhashentry(uid); struct user_struct *up, *new; spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); spin_unlock_irq(&uidhash_lock); if (!up) { new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); if (!new) return NULL; new->uid = uid; refcount_set(&new->__count, 1); ratelimit_state_init(&new->ratelimit, HZ, 100); ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE); /* * Before adding this, check whether we raced * on adding the same user already.. */ spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { kmem_cache_free(uid_cachep, new); } else { uid_hash_insert(new, hashent); up = new; } spin_unlock_irq(&uidhash_lock); } return up; } static int __init uid_cache_init(void) { int n; uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); for(n = 0; n < UIDHASH_SZ; ++n) INIT_HLIST_HEAD(uidhash_table + n); /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID)); spin_unlock_irq(&uidhash_lock); return 0; } subsys_initcall(uid_cache_init);
1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ioctl.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/syscalls.h> #include <linux/mm.h> #include <linux/capability.h> #include <linux/compat.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/security.h> #include <linux/export.h> #include <linux/uaccess.h> #include <linux/writeback.h> #include <linux/buffer_head.h> #include <linux/falloc.h> #include <linux/sched/signal.h> #include <linux/fiemap.h> #include "internal.h" #include <asm/ioctls.h> /* So that the fiemap access checks can't overflow on 32 bit machines. */ #define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) /** * vfs_ioctl - call filesystem specific ioctl methods * @filp: open file to invoke ioctl method on * @cmd: ioctl command to execute * @arg: command-specific argument for ioctl * * Invokes filesystem specific ->unlocked_ioctl, if one exists; otherwise * returns -ENOTTY. * * Returns 0 on success, -errno on error. */ long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = -ENOTTY; if (!filp->f_op->unlocked_ioctl) goto out; error = filp->f_op->unlocked_ioctl(filp, cmd, arg); if (error == -ENOIOCTLCMD) error = -ENOTTY; out: return error; } EXPORT_SYMBOL(vfs_ioctl); static int ioctl_fibmap(struct file *filp, int __user *p) { struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; int error, ur_block; sector_t block; if (!capable(CAP_SYS_RAWIO)) return -EPERM; error = get_user(ur_block, p); if (error) return error; if (ur_block < 0) return -EINVAL; block = ur_block; error = bmap(inode, &block); if (block > INT_MAX) { error = -ERANGE; pr_warn_ratelimited("[%s/%d] FS: %s File: %pD4 would truncate fibmap result\n", current->comm, task_pid_nr(current), sb->s_id, filp); } if (error) ur_block = 0; else ur_block = block; if (put_user(ur_block, p)) error = -EFAULT; return error; } /** * fiemap_fill_next_extent - Fiemap helper function * @fieinfo: Fiemap context passed into ->fiemap * @logical: Extent logical start offset, in bytes * @phys: Extent physical start offset, in bytes * @len: Extent length, in bytes * @flags: FIEMAP_EXTENT flags that describe this extent * * Called from file system ->fiemap callback. Will populate extent * info as passed in via arguments and copy to user memory. On * success, extent count on fieinfo is incremented. * * Returns 0 on success, -errno on error, 1 if this was the last * extent that will fit in user array. */ #define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) #define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) #define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, u64 phys, u64 len, u32 flags) { struct fiemap_extent extent; struct fiemap_extent __user *dest = fieinfo->fi_extents_start; /* only count the extents */ if (fieinfo->fi_extents_max == 0) { fieinfo->fi_extents_mapped++; return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; } if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max) return 1; if (flags & SET_UNKNOWN_FLAGS) flags |= FIEMAP_EXTENT_UNKNOWN; if (flags & SET_NO_UNMOUNTED_IO_FLAGS) flags |= FIEMAP_EXTENT_ENCODED; if (flags & SET_NOT_ALIGNED_FLAGS) flags |= FIEMAP_EXTENT_NOT_ALIGNED; memset(&extent, 0, sizeof(extent)); extent.fe_logical = logical; extent.fe_physical = phys; extent.fe_length = len; extent.fe_flags = flags; dest += fieinfo->fi_extents_mapped; if (copy_to_user(dest, &extent, sizeof(extent))) return -EFAULT; fieinfo->fi_extents_mapped++; if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max) return 1; return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; } EXPORT_SYMBOL(fiemap_fill_next_extent); /** * fiemap_prep - check validity of requested flags for fiemap * @inode: Inode to operate on * @fieinfo: Fiemap context passed into ->fiemap * @start: Start of the mapped range * @len: Length of the mapped range, can be truncated by this function. * @supported_flags: Set of fiemap flags that the file system understands * * This function must be called from each ->fiemap instance to validate the * fiemap request against the file system parameters. * * Returns 0 on success, or a negative error on failure. */ int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 *len, u32 supported_flags) { u64 maxbytes = inode->i_sb->s_maxbytes; u32 incompat_flags; int ret = 0; if (*len == 0) return -EINVAL; if (start >= maxbytes) return -EFBIG; /* * Shrink request scope to what the fs can actually handle. */ if (*len > maxbytes || (maxbytes - *len) < start) *len = maxbytes - start; supported_flags |= FIEMAP_FLAG_SYNC; supported_flags &= FIEMAP_FLAGS_COMPAT; incompat_flags = fieinfo->fi_flags & ~supported_flags; if (incompat_flags) { fieinfo->fi_flags = incompat_flags; return -EBADR; } if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) ret = filemap_write_and_wait(inode->i_mapping); return ret; } EXPORT_SYMBOL(fiemap_prep); static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) { struct fiemap fiemap; struct fiemap_extent_info fieinfo = { 0, }; struct inode *inode = file_inode(filp); int error; if (!inode->i_op->fiemap) return -EOPNOTSUPP; if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) return -EFAULT; if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) return -EINVAL; fieinfo.fi_flags = fiemap.fm_flags; fieinfo.fi_extents_max = fiemap.fm_extent_count; fieinfo.fi_extents_start = ufiemap->fm_extents; error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, fiemap.fm_length); fiemap.fm_flags = fieinfo.fi_flags; fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) error = -EFAULT; return error; } static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, u64 off, u64 olen, u64 destoff) { struct fd src_file = fdget(srcfd); loff_t cloned; int ret; if (!src_file.file) return -EBADF; ret = -EXDEV; if (src_file.file->f_path.mnt != dst_file->f_path.mnt) goto fdput; cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen, 0); if (cloned < 0) ret = cloned; else if (olen && cloned != olen) ret = -EINVAL; else ret = 0; fdput: fdput(src_file); return ret; } static long ioctl_file_clone_range(struct file *file, struct file_clone_range __user *argp) { struct file_clone_range args; if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; return ioctl_file_clone(file, args.src_fd, args.src_offset, args.src_length, args.dest_offset); } #ifdef CONFIG_BLOCK static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) { return (offset >> inode->i_blkbits); } static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) { return (blk << inode->i_blkbits); } /** * __generic_block_fiemap - FIEMAP for block based inodes (no locking) * @inode: the inode to map * @fieinfo: the fiemap info struct that will be passed back to userspace * @start: where to start mapping in the inode * @len: how much space to map * @get_block: the fs's get_block function * * This does FIEMAP for block based inodes. Basically it will just loop * through get_block until we hit the number of extents we want to map, or we * go past the end of the file and hit a hole. * * If it is possible to have data blocks beyond a hole past @inode->i_size, then * please do not use this function, it will stop at the first unmapped block * beyond i_size. * * If you use this function directly, you need to do your own locking. Use * generic_block_fiemap if you want the locking done for you. */ static int __generic_block_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, loff_t start, loff_t len, get_block_t *get_block) { struct buffer_head map_bh; sector_t start_blk, last_blk; loff_t isize = i_size_read(inode); u64 logical = 0, phys = 0, size = 0; u32 flags = FIEMAP_EXTENT_MERGED; bool past_eof = false, whole_file = false; int ret = 0; ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC); if (ret) return ret; /* * Either the i_mutex or other appropriate locking needs to be held * since we expect isize to not change at all through the duration of * this call. */ if (len >= isize) { whole_file = true; len = isize; } /* * Some filesystems can't deal with being asked to map less than * blocksize, so make sure our len is at least block length. */ if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); start_blk = logical_to_blk(inode, start); last_blk = logical_to_blk(inode, start + len - 1); do { /* * we set b_size to the total size we want so it will map as * many contiguous blocks as possible at once */ memset(&map_bh, 0, sizeof(struct buffer_head)); map_bh.b_size = len; ret = get_block(inode, start_blk, &map_bh, 0); if (ret) break; /* HOLE */ if (!buffer_mapped(&map_bh)) { start_blk++; /* * We want to handle the case where there is an * allocated block at the front of the file, and then * nothing but holes up to the end of the file properly, * to make sure that extent at the front gets properly * marked with FIEMAP_EXTENT_LAST */ if (!past_eof && blk_to_logical(inode, start_blk) >= isize) past_eof = 1; /* * First hole after going past the EOF, this is our * last extent */ if (past_eof && size) { flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST; ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); } else if (size) { ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); size = 0; } /* if we have holes up to/past EOF then we're done */ if (start_blk > last_blk || past_eof || ret) break; } else { /* * We have gone over the length of what we wanted to * map, and it wasn't the entire file, so add the extent * we got last time and exit. * * This is for the case where say we want to map all the * way up to the second to the last block in a file, but * the last block is a hole, making the second to last * block FIEMAP_EXTENT_LAST. In this case we want to * see if there is a hole after the second to last block * so we can mark it properly. If we found data after * we exceeded the length we were requesting, then we * are good to go, just add the extent to the fieinfo * and break */ if (start_blk > last_blk && !whole_file) { ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); break; } /* * if size != 0 then we know we already have an extent * to add, so add it. */ if (size) { ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); if (ret) break; } logical = blk_to_logical(inode, start_blk); phys = blk_to_logical(inode, map_bh.b_blocknr); size = map_bh.b_size; flags = FIEMAP_EXTENT_MERGED; start_blk += logical_to_blk(inode, size); /* * If we are past the EOF, then we need to make sure as * soon as we find a hole that the last extent we found * is marked with FIEMAP_EXTENT_LAST */ if (!past_eof && logical + size >= isize) past_eof = true; } cond_resched(); if (fatal_signal_pending(current)) { ret = -EINTR; break; } } while (1); /* If ret is 1 then we just hit the end of the extent array */ if (ret == 1) ret = 0; return ret; } /** * generic_block_fiemap - FIEMAP for block based inodes * @inode: The inode to map * @fieinfo: The mapping information * @start: The initial block to map * @len: The length of the extect to attempt to map * @get_block: The block mapping function for the fs * * Calls __generic_block_fiemap to map the inode, after taking * the inode's mutex lock. */ int generic_block_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len, get_block_t *get_block) { int ret; inode_lock(inode); ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block); inode_unlock(inode); return ret; } EXPORT_SYMBOL(generic_block_fiemap); #endif /* CONFIG_BLOCK */ /* * This provides compatibility with legacy XFS pre-allocation ioctls * which predate the fallocate syscall. * * Only the l_start, l_len and l_whence fields of the 'struct space_resv' * are used here, rest are ignored. */ static int ioctl_preallocate(struct file *filp, int mode, void __user *argp) { struct inode *inode = file_inode(filp); struct space_resv sr; if (copy_from_user(&sr, argp, sizeof(sr))) return -EFAULT; switch (sr.l_whence) { case SEEK_SET: break; case SEEK_CUR: sr.l_start += filp->f_pos; break; case SEEK_END: sr.l_start += i_size_read(inode); break; default: return -EINVAL; } return vfs_fallocate(filp, mode | FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); } /* on ia32 l_start is on a 32-bit boundary */ #if defined CONFIG_COMPAT && defined(CONFIG_X86_64) /* just account for different alignment */ static int compat_ioctl_preallocate(struct file *file, int mode, struct space_resv_32 __user *argp) { struct inode *inode = file_inode(file); struct space_resv_32 sr; if (copy_from_user(&sr, argp, sizeof(sr))) return -EFAULT; switch (sr.l_whence) { case SEEK_SET: break; case SEEK_CUR: sr.l_start += file->f_pos; break; case SEEK_END: sr.l_start += i_size_read(inode); break; default: return -EINVAL; } return vfs_fallocate(file, mode | FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); } #endif static int file_ioctl(struct file *filp, unsigned int cmd, int __user *p) { switch (cmd) { case FIBMAP: return ioctl_fibmap(filp, p); case FS_IOC_RESVSP: case FS_IOC_RESVSP64: return ioctl_preallocate(filp, 0, p); case FS_IOC_UNRESVSP: case FS_IOC_UNRESVSP64: return ioctl_preallocate(filp, FALLOC_FL_PUNCH_HOLE, p); case FS_IOC_ZERO_RANGE: return ioctl_preallocate(filp, FALLOC_FL_ZERO_RANGE, p); } return -ENOIOCTLCMD; } static int ioctl_fionbio(struct file *filp, int __user *argp) { unsigned int flag; int on, error; error = get_user(on, argp); if (error) return error; flag = O_NONBLOCK; #ifdef __sparc__ /* SunOS compatibility item. */ if (O_NONBLOCK != O_NDELAY) flag |= O_NDELAY; #endif spin_lock(&filp->f_lock); if (on) filp->f_flags |= flag; else filp->f_flags &= ~flag; spin_unlock(&filp->f_lock); return error; } static int ioctl_fioasync(unsigned int fd, struct file *filp, int __user *argp) { unsigned int flag; int on, error; error = get_user(on, argp); if (error) return error; flag = on ? FASYNC : 0; /* Did FASYNC state change ? */ if ((flag ^ filp->f_flags) & FASYNC) { if (filp->f_op->fasync) /* fasync() adjusts filp->f_flags */ error = filp->f_op->fasync(fd, filp, on); else error = -ENOTTY; } return error < 0 ? error : 0; } static int ioctl_fsfreeze(struct file *filp) { struct super_block *sb = file_inode(filp)->i_sb; if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; /* If filesystem doesn't support freeze feature, return. */ if (sb->s_op->freeze_fs == NULL && sb->s_op->freeze_super == NULL) return -EOPNOTSUPP; /* Freeze */ if (sb->s_op->freeze_super) return sb->s_op->freeze_super(sb); return freeze_super(sb); } static int ioctl_fsthaw(struct file *filp) { struct super_block *sb = file_inode(filp)->i_sb; if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; /* Thaw */ if (sb->s_op->thaw_super) return sb->s_op->thaw_super(sb); return thaw_super(sb); } static int ioctl_file_dedupe_range(struct file *file, struct file_dedupe_range __user *argp) { struct file_dedupe_range *same = NULL; int ret; unsigned long size; u16 count; if (get_user(count, &argp->dest_count)) { ret = -EFAULT; goto out; } size = offsetof(struct file_dedupe_range __user, info[count]); if (size > PAGE_SIZE) { ret = -ENOMEM; goto out; } same = memdup_user(argp, size); if (IS_ERR(same)) { ret = PTR_ERR(same); same = NULL; goto out; } same->dest_count = count; ret = vfs_dedupe_file_range(file, same); if (ret) goto out; ret = copy_to_user(argp, same, size); if (ret) ret = -EFAULT; out: kfree(same); return ret; } /* * do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d. * It's just a simple helper for sys_ioctl and compat_sys_ioctl. * * When you add any new common ioctls to the switches above and below, * please ensure they have compatible arguments in compat mode. */ static int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct inode *inode = file_inode(filp); switch (cmd) { case FIOCLEX: set_close_on_exec(fd, 1); return 0; case FIONCLEX: set_close_on_exec(fd, 0); return 0; case FIONBIO: return ioctl_fionbio(filp, argp); case FIOASYNC: return ioctl_fioasync(fd, filp, argp); case FIOQSIZE: if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { loff_t res = inode_get_bytes(inode); return copy_to_user(argp, &res, sizeof(res)) ? -EFAULT : 0; } return -ENOTTY; case FIFREEZE: return ioctl_fsfreeze(filp); case FITHAW: return ioctl_fsthaw(filp); case FS_IOC_FIEMAP: return ioctl_fiemap(filp, argp); case FIGETBSZ: /* anon_bdev filesystems may not have a block size */ if (!inode->i_sb->s_blocksize) return -EINVAL; return put_user(inode->i_sb->s_blocksize, (int __user *)argp); case FICLONE: return ioctl_file_clone(filp, arg, 0, 0, 0); case FICLONERANGE: return ioctl_file_clone_range(filp, argp); case FIDEDUPERANGE: return ioctl_file_dedupe_range(filp, argp); case FIONREAD: if (!S_ISREG(inode->i_mode)) return vfs_ioctl(filp, cmd, arg); return put_user(i_size_read(inode) - filp->f_pos, (int __user *)argp); default: if (S_ISREG(inode->i_mode)) return file_ioctl(filp, cmd, argp); break; } return -ENOIOCTLCMD; } SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { struct fd f = fdget(fd); int error; if (!f.file) return -EBADF; error = security_file_ioctl(f.file, cmd, arg); if (error) goto out; error = do_vfs_ioctl(f.file, fd, cmd, arg); if (error == -ENOIOCTLCMD) error = vfs_ioctl(f.file, cmd, arg); out: fdput(f); return error; } #ifdef CONFIG_COMPAT /** * compat_ptr_ioctl - generic implementation of .compat_ioctl file operation * * This is not normally called as a function, but instead set in struct * file_operations as * * .compat_ioctl = compat_ptr_ioctl, * * On most architectures, the compat_ptr_ioctl() just passes all arguments * to the corresponding ->ioctl handler. The exception is arch/s390, where * compat_ptr() clears the top bit of a 32-bit pointer value, so user space * pointers to the second 2GB alias the first 2GB, as is the case for * native 32-bit s390 user space. * * The compat_ptr_ioctl() function must therefore be used only with ioctl * functions that either ignore the argument or pass a pointer to a * compatible data type. * * If any ioctl command handled by fops->unlocked_ioctl passes a plain * integer instead of a pointer, or any of the passed data types * is incompatible between 32-bit and 64-bit architectures, a proper * handler is required instead of compat_ptr_ioctl. */ long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { if (!file->f_op->unlocked_ioctl) return -ENOIOCTLCMD; return file->f_op->unlocked_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); } EXPORT_SYMBOL(compat_ptr_ioctl); COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { struct fd f = fdget(fd); int error; if (!f.file) return -EBADF; error = security_file_ioctl_compat(f.file, cmd, arg); if (error) goto out; switch (cmd) { /* FICLONE takes an int argument, so don't use compat_ptr() */ case FICLONE: error = ioctl_file_clone(f.file, arg, 0, 0, 0); break; #if defined(CONFIG_X86_64) /* these get messy on amd64 due to alignment differences */ case FS_IOC_RESVSP_32: case FS_IOC_RESVSP64_32: error = compat_ioctl_preallocate(f.file, 0, compat_ptr(arg)); break; case FS_IOC_UNRESVSP_32: case FS_IOC_UNRESVSP64_32: error = compat_ioctl_preallocate(f.file, FALLOC_FL_PUNCH_HOLE, compat_ptr(arg)); break; case FS_IOC_ZERO_RANGE_32: error = compat_ioctl_preallocate(f.file, FALLOC_FL_ZERO_RANGE, compat_ptr(arg)); break; #endif /* * everything else in do_vfs_ioctl() takes either a compatible * pointer argument or no argument -- call it with a modified * argument. */ default: error = do_vfs_ioctl(f.file, fd, cmd, (unsigned long)compat_ptr(arg)); if (error != -ENOIOCTLCMD) break; if (f.file->f_op->compat_ioctl) error = f.file->f_op->compat_ioctl(f.file, cmd, arg); if (error == -ENOIOCTLCMD) error = -ENOTTY; break; } out: fdput(f); return error; } #endif
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 // SPDX-License-Identifier: GPL-2.0-or-later /* Manage a process's keyrings * * Copyright (C) 2004-2005, 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/init.h> #include <linux/sched.h> #include <linux/sched/user.h> #include <linux/keyctl.h> #include <linux/fs.h> #include <linux/err.h> #include <linux/mutex.h> #include <linux/security.h> #include <linux/user_namespace.h> #include <linux/uaccess.h> #include <linux/init_task.h> #include <keys/request_key_auth-type.h> #include "internal.h" /* Session keyring create vs join semaphore */ static DEFINE_MUTEX(key_session_mutex); /* The root user's tracking struct */ struct key_user root_key_user = { .usage = REFCOUNT_INIT(3), .cons_lock = __MUTEX_INITIALIZER(root_key_user.cons_lock), .lock = __SPIN_LOCK_UNLOCKED(root_key_user.lock), .nkeys = ATOMIC_INIT(2), .nikeys = ATOMIC_INIT(2), .uid = GLOBAL_ROOT_UID, }; /* * Get or create a user register keyring. */ static struct key *get_user_register(struct user_namespace *user_ns) { struct key *reg_keyring = READ_ONCE(user_ns->user_keyring_register); if (reg_keyring) return reg_keyring; down_write(&user_ns->keyring_sem); /* Make sure there's a register keyring. It gets owned by the * user_namespace's owner. */ reg_keyring = user_ns->user_keyring_register; if (!reg_keyring) { reg_keyring = keyring_alloc(".user_reg", user_ns->owner, INVALID_GID, &init_cred, KEY_POS_WRITE | KEY_POS_SEARCH | KEY_USR_VIEW | KEY_USR_READ, 0, NULL, NULL); if (!IS_ERR(reg_keyring)) smp_store_release(&user_ns->user_keyring_register, reg_keyring); } up_write(&user_ns->keyring_sem); /* We don't return a ref since the keyring is pinned by the user_ns */ return reg_keyring; } /* * Look up the user and user session keyrings for the current process's UID, * creating them if they don't exist. */ int look_up_user_keyrings(struct key **_user_keyring, struct key **_user_session_keyring) { const struct cred *cred = current_cred(); struct user_namespace *user_ns = current_user_ns(); struct key *reg_keyring, *uid_keyring, *session_keyring; key_perm_t user_keyring_perm; key_ref_t uid_keyring_r, session_keyring_r; uid_t uid = from_kuid(user_ns, cred->user->uid); char buf[20]; int ret; user_keyring_perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL; kenter("%u", uid); reg_keyring = get_user_register(user_ns); if (IS_ERR(reg_keyring)) return PTR_ERR(reg_keyring); down_write(&user_ns->keyring_sem); ret = 0; /* Get the user keyring. Note that there may be one in existence * already as it may have been pinned by a session, but the user_struct * pointing to it may have been destroyed by setuid. */ snprintf(buf, sizeof(buf), "_uid.%u", uid); uid_keyring_r = keyring_search(make_key_ref(reg_keyring, true), &key_type_keyring, buf, false); kdebug("_uid %p", uid_keyring_r); if (uid_keyring_r == ERR_PTR(-EAGAIN)) { uid_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID, cred, user_keyring_perm, KEY_ALLOC_UID_KEYRING | KEY_ALLOC_IN_QUOTA, NULL, reg_keyring); if (IS_ERR(uid_keyring)) { ret = PTR_ERR(uid_keyring); goto error; } } else if (IS_ERR(uid_keyring_r)) { ret = PTR_ERR(uid_keyring_r); goto error; } else { uid_keyring = key_ref_to_ptr(uid_keyring_r); } /* Get a default session keyring (which might also exist already) */ snprintf(buf, sizeof(buf), "_uid_ses.%u", uid); session_keyring_r = keyring_search(make_key_ref(reg_keyring, true), &key_type_keyring, buf, false); kdebug("_uid_ses %p", session_keyring_r); if (session_keyring_r == ERR_PTR(-EAGAIN)) { session_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID, cred, user_keyring_perm, KEY_ALLOC_UID_KEYRING | KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); goto error_release; } /* We install a link from the user session keyring to * the user keyring. */ ret = key_link(session_keyring, uid_keyring); if (ret < 0) goto error_release_session; /* And only then link the user-session keyring to the * register. */ ret = key_link(reg_keyring, session_keyring); if (ret < 0) goto error_release_session; } else if (IS_ERR(session_keyring_r)) { ret = PTR_ERR(session_keyring_r); goto error_release; } else { session_keyring = key_ref_to_ptr(session_keyring_r); } up_write(&user_ns->keyring_sem); if (_user_session_keyring) *_user_session_keyring = session_keyring; else key_put(session_keyring); if (_user_keyring) *_user_keyring = uid_keyring; else key_put(uid_keyring); kleave(" = 0"); return 0; error_release_session: key_put(session_keyring); error_release: key_put(uid_keyring); error: up_write(&user_ns->keyring_sem); kleave(" = %d", ret); return ret; } /* * Get the user session keyring if it exists, but don't create it if it * doesn't. */ struct key *get_user_session_keyring_rcu(const struct cred *cred) { struct key *reg_keyring = READ_ONCE(cred->user_ns->user_keyring_register); key_ref_t session_keyring_r; char buf[20]; struct keyring_search_context ctx = { .index_key.type = &key_type_keyring, .index_key.description = buf, .cred = cred, .match_data.cmp = key_default_cmp, .match_data.raw_data = buf, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = KEYRING_SEARCH_DO_STATE_CHECK, }; if (!reg_keyring) return NULL; ctx.index_key.desc_len = snprintf(buf, sizeof(buf), "_uid_ses.%u", from_kuid(cred->user_ns, cred->user->uid)); session_keyring_r = keyring_search_rcu(make_key_ref(reg_keyring, true), &ctx); if (IS_ERR(session_keyring_r)) return NULL; return key_ref_to_ptr(session_keyring_r); } /* * Install a thread keyring to the given credentials struct if it didn't have * one already. This is allowed to overrun the quota. * * Return: 0 if a thread keyring is now present; -errno on failure. */ int install_thread_keyring_to_cred(struct cred *new) { struct key *keyring; if (new->thread_keyring) return 0; keyring = keyring_alloc("_tid", new->uid, new->gid, new, KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); new->thread_keyring = keyring; return 0; } /* * Install a thread keyring to the current task if it didn't have one already. * * Return: 0 if a thread keyring is now present; -errno on failure. */ static int install_thread_keyring(void) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_thread_keyring_to_cred(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Install a process keyring to the given credentials struct if it didn't have * one already. This is allowed to overrun the quota. * * Return: 0 if a process keyring is now present; -errno on failure. */ int install_process_keyring_to_cred(struct cred *new) { struct key *keyring; if (new->process_keyring) return 0; keyring = keyring_alloc("_pid", new->uid, new->gid, new, KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); new->process_keyring = keyring; return 0; } /* * Install a process keyring to the current task if it didn't have one already. * * Return: 0 if a process keyring is now present; -errno on failure. */ static int install_process_keyring(void) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_process_keyring_to_cred(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Install the given keyring as the session keyring of the given credentials * struct, replacing the existing one if any. If the given keyring is NULL, * then install a new anonymous session keyring. * @cred can not be in use by any task yet. * * Return: 0 on success; -errno on failure. */ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring) { unsigned long flags; struct key *old; might_sleep(); /* create an empty session keyring */ if (!keyring) { flags = KEY_ALLOC_QUOTA_OVERRUN; if (cred->session_keyring) flags = KEY_ALLOC_IN_QUOTA; keyring = keyring_alloc("_ses", cred->uid, cred->gid, cred, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, flags, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); } else { __key_get(keyring); } /* install the keyring */ old = cred->session_keyring; cred->session_keyring = keyring; if (old) key_put(old); return 0; } /* * Install the given keyring as the session keyring of the current task, * replacing the existing one if any. If the given keyring is NULL, then * install a new anonymous session keyring. * * Return: 0 on success; -errno on failure. */ static int install_session_keyring(struct key *keyring) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_session_keyring_to_cred(new, keyring); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Handle the fsuid changing. */ void key_fsuid_changed(struct cred *new_cred) { /* update the ownership of the thread keyring */ if (new_cred->thread_keyring) { down_write(&new_cred->thread_keyring->sem); new_cred->thread_keyring->uid = new_cred->fsuid; up_write(&new_cred->thread_keyring->sem); } } /* * Handle the fsgid changing. */ void key_fsgid_changed(struct cred *new_cred) { /* update the ownership of the thread keyring */ if (new_cred->thread_keyring) { down_write(&new_cred->thread_keyring->sem); new_cred->thread_keyring->gid = new_cred->fsgid; up_write(&new_cred->thread_keyring->sem); } } /* * Search the process keyrings attached to the supplied cred for the first * matching key under RCU conditions (the caller must be holding the RCU read * lock). * * The search criteria are the type and the match function. The description is * given to the match function as a parameter, but doesn't otherwise influence * the search. Typically the match function will compare the description * parameter to the key's description. * * This can only search keyrings that grant Search permission to the supplied * credentials. Keyrings linked to searched keyrings will also be searched if * they grant Search permission too. Keys can only be found if they grant * Search permission to the credentials. * * Returns a pointer to the key with the key usage count incremented if * successful, -EAGAIN if we didn't find any matching key or -ENOKEY if we only * matched negative keys. * * In the case of a successful return, the possession attribute is set on the * returned key reference. */ key_ref_t search_cred_keyrings_rcu(struct keyring_search_context *ctx) { struct key *user_session; key_ref_t key_ref, ret, err; const struct cred *cred = ctx->cred; /* we want to return -EAGAIN or -ENOKEY if any of the keyrings were * searchable, but we failed to find a key or we found a negative key; * otherwise we want to return a sample error (probably -EACCES) if * none of the keyrings were searchable * * in terms of priority: success > -ENOKEY > -EAGAIN > other error */ key_ref = NULL; ret = NULL; err = ERR_PTR(-EAGAIN); /* search the thread keyring first */ if (cred->thread_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->thread_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* search the process keyring second */ if (cred->process_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->process_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* search the session keyring */ if (cred->session_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->session_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* or search the user-session keyring */ else if ((user_session = get_user_session_keyring_rcu(cred))) { key_ref = keyring_search_rcu(make_key_ref(user_session, 1), ctx); key_put(user_session); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* no key - decide on the error we're going to go for */ key_ref = ret ? ret : err; found: return key_ref; } /* * Search the process keyrings attached to the supplied cred for the first * matching key in the manner of search_my_process_keyrings(), but also search * the keys attached to the assumed authorisation key using its credentials if * one is available. * * The caller must be holding the RCU read lock. * * Return same as search_cred_keyrings_rcu(). */ key_ref_t search_process_keyrings_rcu(struct keyring_search_context *ctx) { struct request_key_auth *rka; key_ref_t key_ref, ret = ERR_PTR(-EACCES), err; key_ref = search_cred_keyrings_rcu(ctx); if (!IS_ERR(key_ref)) goto found; err = key_ref; /* if this process has an instantiation authorisation key, then we also * search the keyrings of the process mentioned there * - we don't permit access to request_key auth keys via this method */ if (ctx->cred->request_key_auth && ctx->cred == current_cred() && ctx->index_key.type != &key_type_request_key_auth ) { const struct cred *cred = ctx->cred; if (key_validate(cred->request_key_auth) == 0) { rka = ctx->cred->request_key_auth->payload.data[0]; //// was search_process_keyrings() [ie. recursive] ctx->cred = rka->cred; key_ref = search_cred_keyrings_rcu(ctx); ctx->cred = cred; if (!IS_ERR(key_ref)) goto found; ret = key_ref; } } /* no key - decide on the error we're going to go for */ if (err == ERR_PTR(-ENOKEY) || ret == ERR_PTR(-ENOKEY)) key_ref = ERR_PTR(-ENOKEY); else if (err == ERR_PTR(-EACCES)) key_ref = ret; else key_ref = err; found: return key_ref; } /* * See if the key we're looking at is the target key. */ bool lookup_user_key_possessed(const struct key *key, const struct key_match_data *match_data) { return key == match_data->raw_data; } /* * Look up a key ID given us by userspace with a given permissions mask to get * the key it refers to. * * Flags can be passed to request that special keyrings be created if referred * to directly, to permit partially constructed keys to be found and to skip * validity and permission checks on the found key. * * Returns a pointer to the key with an incremented usage count if successful; * -EINVAL if the key ID is invalid; -ENOKEY if the key ID does not correspond * to a key or the best found key was a negative key; -EKEYREVOKED or * -EKEYEXPIRED if the best found key was revoked or expired; -EACCES if the * found key doesn't grant the requested permit or the LSM denied access to it; * or -ENOMEM if a special keyring couldn't be created. * * In the case of a successful return, the possession attribute is set on the * returned key reference. */ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, enum key_need_perm need_perm) { struct keyring_search_context ctx = { .match_data.cmp = lookup_user_key_possessed, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_RECURSE), }; struct request_key_auth *rka; struct key *key, *user_session; key_ref_t key_ref, skey_ref; int ret; try_again: ctx.cred = get_current_cred(); key_ref = ERR_PTR(-ENOKEY); switch (id) { case KEY_SPEC_THREAD_KEYRING: if (!ctx.cred->thread_keyring) { if (!(lflags & KEY_LOOKUP_CREATE)) goto error; ret = install_thread_keyring(); if (ret < 0) { key_ref = ERR_PTR(ret); goto error; } goto reget_creds; } key = ctx.cred->thread_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_PROCESS_KEYRING: if (!ctx.cred->process_keyring) { if (!(lflags & KEY_LOOKUP_CREATE)) goto error; ret = install_process_keyring(); if (ret < 0) { key_ref = ERR_PTR(ret); goto error; } goto reget_creds; } key = ctx.cred->process_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_SESSION_KEYRING: if (!ctx.cred->session_keyring) { /* always install a session keyring upon access if one * doesn't exist yet */ ret = look_up_user_keyrings(NULL, &user_session); if (ret < 0) goto error; if (lflags & KEY_LOOKUP_CREATE) ret = join_session_keyring(NULL); else ret = install_session_keyring(user_session); key_put(user_session); if (ret < 0) goto error; goto reget_creds; } else if (test_bit(KEY_FLAG_UID_KEYRING, &ctx.cred->session_keyring->flags) && lflags & KEY_LOOKUP_CREATE) { ret = join_session_keyring(NULL); if (ret < 0) goto error; goto reget_creds; } key = ctx.cred->session_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_KEYRING: ret = look_up_user_keyrings(&key, NULL); if (ret < 0) goto error; key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_SESSION_KEYRING: ret = look_up_user_keyrings(NULL, &key); if (ret < 0) goto error; key_ref = make_key_ref(key, 1); break; case KEY_SPEC_GROUP_KEYRING: /* group keyrings are not yet supported */ key_ref = ERR_PTR(-EINVAL); goto error; case KEY_SPEC_REQKEY_AUTH_KEY: key = ctx.cred->request_key_auth; if (!key) goto error; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_REQUESTOR_KEYRING: if (!ctx.cred->request_key_auth) goto error; down_read(&ctx.cred->request_key_auth->sem); if (test_bit(KEY_FLAG_REVOKED, &ctx.cred->request_key_auth->flags)) { key_ref = ERR_PTR(-EKEYREVOKED); key = NULL; } else { rka = ctx.cred->request_key_auth->payload.data[0]; key = rka->dest_keyring; __key_get(key); } up_read(&ctx.cred->request_key_auth->sem); if (!key) goto error; key_ref = make_key_ref(key, 1); break; default: key_ref = ERR_PTR(-EINVAL); if (id < 1) goto error; key = key_lookup(id); if (IS_ERR(key)) { key_ref = ERR_CAST(key); goto error; } key_ref = make_key_ref(key, 0); /* check to see if we possess the key */ ctx.index_key = key->index_key; ctx.match_data.raw_data = key; kdebug("check possessed"); rcu_read_lock(); skey_ref = search_process_keyrings_rcu(&ctx); rcu_read_unlock(); kdebug("possessed=%p", skey_ref); if (!IS_ERR(skey_ref)) { key_put(key); key_ref = skey_ref; } break; } /* unlink does not use the nominated key in any way, so can skip all * the permission checks as it is only concerned with the keyring */ if (need_perm != KEY_NEED_UNLINK) { if (!(lflags & KEY_LOOKUP_PARTIAL)) { ret = wait_for_key_construction(key, true); switch (ret) { case -ERESTARTSYS: goto invalid_key; default: if (need_perm != KEY_AUTHTOKEN_OVERRIDE && need_perm != KEY_DEFER_PERM_CHECK) goto invalid_key; case 0: break; } } else if (need_perm != KEY_DEFER_PERM_CHECK) { ret = key_validate(key); if (ret < 0) goto invalid_key; } ret = -EIO; if (!(lflags & KEY_LOOKUP_PARTIAL) && key_read_state(key) == KEY_IS_UNINSTANTIATED) goto invalid_key; } /* check the permissions */ ret = key_task_permission(key_ref, ctx.cred, need_perm); if (ret < 0) goto invalid_key; key->last_used_at = ktime_get_real_seconds(); error: put_cred(ctx.cred); return key_ref; invalid_key: key_ref_put(key_ref); key_ref = ERR_PTR(ret); goto error; /* if we attempted to install a keyring, then it may have caused new * creds to be installed */ reget_creds: put_cred(ctx.cred); goto try_again; } EXPORT_SYMBOL(lookup_user_key); /* * Join the named keyring as the session keyring if possible else attempt to * create a new one of that name and join that. * * If the name is NULL, an empty anonymous keyring will be installed as the * session keyring. * * Named session keyrings are joined with a semaphore held to prevent the * keyrings from going away whilst the attempt is made to going them and also * to prevent a race in creating compatible session keyrings. */ long join_session_keyring(const char *name) { const struct cred *old; struct cred *new; struct key *keyring; long ret, serial; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); /* if no name is provided, install an anonymous keyring */ if (!name) { ret = install_session_keyring_to_cred(new, NULL); if (ret < 0) goto error; serial = new->session_keyring->serial; ret = commit_creds(new); if (ret == 0) ret = serial; goto okay; } /* allow the user to join or create a named keyring */ mutex_lock(&key_session_mutex); /* look for an existing keyring of this name */ keyring = find_keyring_by_name(name, false); if (PTR_ERR(keyring) == -ENOKEY) { /* not found - try and create a new one */ keyring = keyring_alloc( name, old->uid, old->gid, old, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_LINK, KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } } else if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } else if (keyring == new->session_keyring) { ret = 0; goto error3; } /* we've got a keyring - now to install it */ ret = install_session_keyring_to_cred(new, keyring); if (ret < 0) goto error3; commit_creds(new); mutex_unlock(&key_session_mutex); ret = keyring->serial; key_put(keyring); okay: return ret; error3: key_put(keyring); error2: mutex_unlock(&key_session_mutex); error: abort_creds(new); return ret; } /* * Replace a process's session keyring on behalf of one of its children when * the target process is about to resume userspace execution. */ void key_change_session_keyring(struct callback_head *twork) { const struct cred *old = current_cred(); struct cred *new = container_of(twork, struct cred, rcu); if (unlikely(current->flags & PF_EXITING)) { put_cred(new); return; } new-> uid = old-> uid; new-> euid = old-> euid; new-> suid = old-> suid; new->fsuid = old->fsuid; new-> gid = old-> gid; new-> egid = old-> egid; new-> sgid = old-> sgid; new->fsgid = old->fsgid; new->user = get_uid(old->user); new->user_ns = get_user_ns(old->user_ns); new->group_info = get_group_info(old->group_info); new->securebits = old->securebits; new->cap_inheritable = old->cap_inheritable; new->cap_permitted = old->cap_permitted; new->cap_effective = old->cap_effective; new->cap_ambient = old->cap_ambient; new->cap_bset = old->cap_bset; new->jit_keyring = old->jit_keyring; new->thread_keyring = key_get(old->thread_keyring); new->process_keyring = key_get(old->process_keyring); security_transfer_creds(new, old); commit_creds(new); } /* * Make sure that root's user and user-session keyrings exist. */ static int __init init_root_keyring(void) { return look_up_user_keyrings(NULL, NULL); } late_initcall(init_root_keyring);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TASK_WORK_H #define _LINUX_TASK_WORK_H #include <linux/list.h> #include <linux/sched.h> typedef void (*task_work_func_t)(struct callback_head *); static inline void init_task_work(struct callback_head *twork, task_work_func_t func) { twork->func = func; } enum task_work_notify_mode { TWA_NONE, TWA_RESUME, TWA_SIGNAL, }; int task_work_add(struct task_struct *task, struct callback_head *twork, enum task_work_notify_mode mode); struct callback_head *task_work_cancel_match(struct task_struct *task, bool (*match)(struct callback_head *, void *data), void *data); struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t); void task_work_run(void); static inline void exit_task_work(struct task_struct *task) { task_work_run(); } #endif /* _LINUX_TASK_WORK_H */
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _KERNEL_EVENTS_INTERNAL_H #define _KERNEL_EVENTS_INTERNAL_H #include <linux/hardirq.h> #include <linux/uaccess.h> #include <linux/refcount.h> /* Buffer handling */ #define RING_BUFFER_WRITABLE 0x01 struct perf_buffer { refcount_t refcount; struct rcu_head rcu_head; #ifdef CONFIG_PERF_USE_VMALLOC struct work_struct work; int page_order; /* allocation order */ #endif int nr_pages; /* nr of data pages */ int overwrite; /* can overwrite itself */ int paused; /* can write into ring buffer */ atomic_t poll; /* POLL_ for wakeups */ local_t head; /* write position */ unsigned int nest; /* nested writers */ local_t events; /* event limit */ local_t wakeup; /* wakeup stamp */ local_t lost; /* nr records lost */ long watermark; /* wakeup watermark */ long aux_watermark; /* poll crap */ spinlock_t event_lock; struct list_head event_list; atomic_t mmap_count; unsigned long mmap_locked; struct user_struct *mmap_user; /* AUX area */ long aux_head; unsigned int aux_nest; long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */ unsigned long aux_pgoff; int aux_nr_pages; int aux_overwrite; atomic_t aux_mmap_count; unsigned long aux_mmap_locked; void (*free_aux)(void *); refcount_t aux_refcount; int aux_in_sampling; void **aux_pages; void *aux_priv; struct perf_event_mmap_page *user_page; void *data_pages[]; }; extern void rb_free(struct perf_buffer *rb); static inline void rb_free_rcu(struct rcu_head *rcu_head) { struct perf_buffer *rb; rb = container_of(rcu_head, struct perf_buffer, rcu_head); rb_free(rb); } static inline void rb_toggle_paused(struct perf_buffer *rb, bool pause) { if (!pause && rb->nr_pages) rb->paused = 0; else rb->paused = 1; } extern struct perf_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); extern int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, pgoff_t pgoff, int nr_pages, long watermark, int flags); extern void rb_free_aux(struct perf_buffer *rb); extern struct perf_buffer *ring_buffer_get(struct perf_event *event); extern void ring_buffer_put(struct perf_buffer *rb); static inline bool rb_has_aux(struct perf_buffer *rb) { return !!rb->aux_nr_pages; } void perf_event_aux_event(struct perf_event *event, unsigned long head, unsigned long size, u64 flags); extern struct page * perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff); #ifdef CONFIG_PERF_USE_VMALLOC /* * Back perf_mmap() with vmalloc memory. * * Required for architectures that have d-cache aliasing issues. */ static inline int page_order(struct perf_buffer *rb) { return rb->page_order; } #else static inline int page_order(struct perf_buffer *rb) { return 0; } #endif static inline int data_page_nr(struct perf_buffer *rb) { return rb->nr_pages << page_order(rb); } static inline unsigned long perf_data_size(struct perf_buffer *rb) { return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); } static inline unsigned long perf_aux_size(struct perf_buffer *rb) { return rb->aux_nr_pages << PAGE_SHIFT; } #define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...) \ { \ unsigned long size, written; \ \ do { \ size = min(handle->size, len); \ written = memcpy_func(__VA_ARGS__); \ written = size - written; \ \ len -= written; \ handle->addr += written; \ if (advance_buf) \ buf += written; \ handle->size -= written; \ if (!handle->size) { \ struct perf_buffer *rb = handle->rb; \ \ handle->page++; \ handle->page &= rb->nr_pages - 1; \ handle->addr = rb->data_pages[handle->page]; \ handle->size = PAGE_SIZE << page_order(rb); \ } \ } while (len && written == size); \ \ return len; \ } #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ static inline unsigned long \ func_name(struct perf_output_handle *handle, \ const void *buf, unsigned long len) \ __DEFINE_OUTPUT_COPY_BODY(true, memcpy_func, handle->addr, buf, size) static inline unsigned long __output_custom(struct perf_output_handle *handle, perf_copy_f copy_func, const void *buf, unsigned long len) { unsigned long orig_len = len; __DEFINE_OUTPUT_COPY_BODY(false, copy_func, handle->addr, buf, orig_len - len, size) } static inline unsigned long memcpy_common(void *dst, const void *src, unsigned long n) { memcpy(dst, src, n); return 0; } DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) static inline unsigned long memcpy_skip(void *dst, const void *src, unsigned long n) { return 0; } DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip) #ifndef arch_perf_out_copy_user #define arch_perf_out_copy_user arch_perf_out_copy_user static inline unsigned long arch_perf_out_copy_user(void *dst, const void *src, unsigned long n) { unsigned long ret; pagefault_disable(); ret = __copy_from_user_inatomic(dst, src, n); pagefault_enable(); return ret; } #endif DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) static inline int get_recursion_context(int *recursion) { unsigned int pc = preempt_count(); unsigned char rctx = 0; rctx += !!(pc & (NMI_MASK)); rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK)); rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); if (recursion[rctx]) return -1; recursion[rctx]++; barrier(); return rctx; } static inline void put_recursion_context(int *recursion, int rctx) { barrier(); recursion[rctx]--; } #ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP static inline bool arch_perf_have_user_stack_dump(void) { return true; } #define perf_user_stack_pointer(regs) user_stack_pointer(regs) #else static inline bool arch_perf_have_user_stack_dump(void) { return false; } #define perf_user_stack_pointer(regs) 0 #endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */ #endif /* _KERNEL_EVENTS_INTERNAL_H */
4 4 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 // SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/capability.c * * Copyright (C) 1997 Andrew Main <zefram@fysh.org> * * Integrated into 2.1.97+, Andrew G. Morgan <morgan@kernel.org> * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/audit.h> #include <linux/capability.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/uaccess.h> /* * Leveraged for setting/resetting capabilities */ const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; EXPORT_SYMBOL(__cap_empty_set); int file_caps_enabled = 1; static int __init file_caps_disable(char *str) { file_caps_enabled = 0; return 1; } __setup("no_file_caps", file_caps_disable); #ifdef CONFIG_MULTIUSER /* * More recent versions of libcap are available from: * * http://www.kernel.org/pub/linux/libs/security/linux-privs/ */ static void warn_legacy_capability_use(void) { char name[sizeof(current->comm)]; pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n", get_task_comm(name, current)); } /* * Version 2 capabilities worked fine, but the linux/capability.h file * that accompanied their introduction encouraged their use without * the necessary user-space source code changes. As such, we have * created a version 3 with equivalent functionality to version 2, but * with a header change to protect legacy source code from using * version 2 when it wanted to use version 1. If your system has code * that trips the following warning, it is using version 2 specific * capabilities and may be doing so insecurely. * * The remedy is to either upgrade your version of libcap (to 2.10+, * if the application is linked against it), or recompile your * application with modern kernel headers and this warning will go * away. */ static void warn_deprecated_v2(void) { char name[sizeof(current->comm)]; pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n", get_task_comm(name, current)); } /* * Version check. Return the number of u32s in each capability flag * array, or a negative value on error. */ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) { __u32 version; if (get_user(version, &header->version)) return -EFAULT; switch (version) { case _LINUX_CAPABILITY_VERSION_1: warn_legacy_capability_use(); *tocopy = _LINUX_CAPABILITY_U32S_1; break; case _LINUX_CAPABILITY_VERSION_2: warn_deprecated_v2(); fallthrough; /* v3 is otherwise equivalent to v2 */ case _LINUX_CAPABILITY_VERSION_3: *tocopy = _LINUX_CAPABILITY_U32S_3; break; default: if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version)) return -EFAULT; return -EINVAL; } return 0; } /* * The only thing that can change the capabilities of the current * process is the current process. As such, we can't be in this code * at the same time as we are in the process of setting capabilities * in this process. The net result is that we can limit our use of * locks to when we are reading the caps of another process. */ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, kernel_cap_t *pIp, kernel_cap_t *pPp) { int ret; if (pid && (pid != task_pid_vnr(current))) { struct task_struct *target; rcu_read_lock(); target = find_task_by_vpid(pid); if (!target) ret = -ESRCH; else ret = security_capget(target, pEp, pIp, pPp); rcu_read_unlock(); } else ret = security_capget(current, pEp, pIp, pPp); return ret; } /** * sys_capget - get the capabilities of a given process. * @header: pointer to struct that contains capability version and * target pid data * @dataptr: pointer to struct that contains the effective, permitted, * and inheritable capabilities that are returned * * Returns 0 on success and < 0 on error. */ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) { int ret = 0; pid_t pid; unsigned tocopy; kernel_cap_t pE, pI, pP; ret = cap_validate_magic(header, &tocopy); if ((dataptr == NULL) || (ret != 0)) return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret; if (get_user(pid, &header->pid)) return -EFAULT; if (pid < 0) return -EINVAL; ret = cap_get_target_pid(pid, &pE, &pI, &pP); if (!ret) { struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i; for (i = 0; i < tocopy; i++) { kdata[i].effective = pE.cap[i]; kdata[i].permitted = pP.cap[i]; kdata[i].inheritable = pI.cap[i]; } /* * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S, * we silently drop the upper capabilities here. This * has the effect of making older libcap * implementations implicitly drop upper capability * bits when they perform a: capget/modify/capset * sequence. * * This behavior is considered fail-safe * behavior. Upgrading the application to a newer * version of libcap will enable access to the newer * capabilities. * * An alternative would be to return an error here * (-ERANGE), but that causes legacy applications to * unexpectedly fail; the capget/modify/capset aborts * before modification is attempted and the application * fails. */ if (copy_to_user(dataptr, kdata, tocopy * sizeof(struct __user_cap_data_struct))) { return -EFAULT; } } return ret; } /** * sys_capset - set capabilities for a process or (*) a group of processes * @header: pointer to struct that contains capability version and * target pid data * @data: pointer to struct that contains the effective, permitted, * and inheritable capabilities * * Set capabilities for the current process only. The ability to any other * process(es) has been deprecated and removed. * * The restrictions on setting capabilities are specified as: * * I: any raised capabilities must be a subset of the old permitted * P: any raised capabilities must be a subset of the old permitted * E: must be set to a subset of new permitted * * Returns 0 on success and < 0 on error. */ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) { struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i, tocopy, copybytes; kernel_cap_t inheritable, permitted, effective; struct cred *new; int ret; pid_t pid; ret = cap_validate_magic(header, &tocopy); if (ret != 0) return ret; if (get_user(pid, &header->pid)) return -EFAULT; /* may only affect current now */ if (pid != 0 && pid != task_pid_vnr(current)) return -EPERM; copybytes = tocopy * sizeof(struct __user_cap_data_struct); if (copybytes > sizeof(kdata)) return -EFAULT; if (copy_from_user(&kdata, data, copybytes)) return -EFAULT; for (i = 0; i < tocopy; i++) { effective.cap[i] = kdata[i].effective; permitted.cap[i] = kdata[i].permitted; inheritable.cap[i] = kdata[i].inheritable; } while (i < _KERNEL_CAPABILITY_U32S) { effective.cap[i] = 0; permitted.cap[i] = 0; inheritable.cap[i] = 0; i++; } effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; new = prepare_creds(); if (!new) return -ENOMEM; ret = security_capset(new, current_cred(), &effective, &inheritable, &permitted); if (ret < 0) goto error; audit_log_capset(new, current_cred()); return commit_creds(new); error: abort_creds(new); return ret; } /** * has_ns_capability - Does a task have a capability in a specific user ns * @t: The task in question * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to the specified user namespace, false if not. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE); rcu_read_unlock(); return (ret == 0); } /** * has_capability - Does a task have a capability in init_user_ns * @t: The task in question * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to the initial user namespace, false if not. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_capability(struct task_struct *t, int cap) { return has_ns_capability(t, &init_user_ns, cap); } EXPORT_SYMBOL(has_capability); /** * has_ns_capability_noaudit - Does a task have a capability (unaudited) * in a specific user ns. * @t: The task in question * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to the specified user namespace, false if not. * Do not write an audit message for the check. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); } /** * has_capability_noaudit - Does a task have a capability (unaudited) in the * initial user ns * @t: The task in question * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to init_user_ns, false if not. Don't write an * audit message for the check. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_capability_noaudit(struct task_struct *t, int cap) { return has_ns_capability_noaudit(t, &init_user_ns, cap); } static bool ns_capable_common(struct user_namespace *ns, int cap, unsigned int opts) { int capable; if (unlikely(!cap_valid(cap))) { pr_crit("capable() called with invalid cap=%u\n", cap); BUG(); } capable = security_capable(current_cred(), ns, cap, opts); if (capable == 0) { current->flags |= PF_SUPERPRIV; return true; } return false; } /** * ns_capable - Determine if the current task has a superior capability in effect * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_NONE); } EXPORT_SYMBOL(ns_capable); /** * ns_capable_noaudit - Determine if the current task has a superior capability * (unaudited) in effect * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable_noaudit(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT); } EXPORT_SYMBOL(ns_capable_noaudit); /** * ns_capable_setid - Determine if the current task has a superior capability * in effect, while signalling that this check is being done from within a * setid or setgroups syscall. * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable_setid(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_INSETID); } EXPORT_SYMBOL(ns_capable_setid); /** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool capable(int cap) { return ns_capable(&init_user_ns, cap); } EXPORT_SYMBOL(capable); #endif /* CONFIG_MULTIUSER */ /** * file_ns_capable - Determine if the file's opener had a capability in effect * @file: The file we want to check * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if task that opened the file had a capability in effect * when the file was opened. * * This does not set PF_SUPERPRIV because the caller may not * actually be privileged. */ bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) { if (WARN_ON_ONCE(!cap_valid(cap))) return false; if (security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0) return true; return false; } EXPORT_SYMBOL(file_ns_capable); /** * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode? * @ns: The user namespace in question * @inode: The inode in question * * Return true if the inode uid and gid are within the namespace. */ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode) { return kuid_has_mapping(ns, inode->i_uid) && kgid_has_mapping(ns, inode->i_gid); } /** * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @inode: The inode in question * @cap: The capability in question * * Return true if the current task has the given capability targeted at * its own user namespace and that the given inode's uid and gid are * mapped into the current user namespace. */ bool capable_wrt_inode_uidgid(const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode); } EXPORT_SYMBOL(capable_wrt_inode_uidgid); /** * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace * @tsk: The task that may be ptraced * @ns: The user namespace to search for CAP_SYS_PTRACE in * * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE * in the specified user namespace. */ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) { int ret = 0; /* An absent tracer adds no restrictions */ const struct cred *cred; rcu_read_lock(); cred = rcu_dereference(tsk->ptracer_cred); if (cred) ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); }
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 /* SPDX-License-Identifier: GPL-2.0 */ /* * This is <linux/capability.h> * * Andrew G. Morgan <morgan@kernel.org> * Alexander Kjeldaas <astor@guardian.no> * with help from Aleph1, Roland Buresund and Andrew Main. * * See here for the libcap library ("POSIX draft" compliance): * * ftp://www.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.6/ */ #ifndef _LINUX_CAPABILITY_H #define _LINUX_CAPABILITY_H #include <uapi/linux/capability.h> #include <linux/uidgid.h> #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3 #define _KERNEL_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_3 extern int file_caps_enabled; typedef struct kernel_cap_struct { __u32 cap[_KERNEL_CAPABILITY_U32S]; } kernel_cap_t; /* same as vfs_ns_cap_data but in cpu endian and always filled completely */ struct cpu_vfs_cap_data { __u32 magic_etc; kernel_cap_t permitted; kernel_cap_t inheritable; kuid_t rootid; }; #define _USER_CAP_HEADER_SIZE (sizeof(struct __user_cap_header_struct)) #define _KERNEL_CAP_T_SIZE (sizeof(kernel_cap_t)) struct file; struct inode; struct dentry; struct task_struct; struct user_namespace; extern const kernel_cap_t __cap_empty_set; extern const kernel_cap_t __cap_init_eff_set; /* * Internal kernel functions only */ #define CAP_FOR_EACH_U32(__capi) \ for (__capi = 0; __capi < _KERNEL_CAPABILITY_U32S; ++__capi) /* * CAP_FS_MASK and CAP_NFSD_MASKS: * * The fs mask is all the privileges that fsuid==0 historically meant. * At one time in the past, that included CAP_MKNOD and CAP_LINUX_IMMUTABLE. * * It has never meant setting security.* and trusted.* xattrs. * * We could also define fsmask as follows: * 1. CAP_FS_MASK is the privilege to bypass all fs-related DAC permissions * 2. The security.* and trusted.* xattrs are fs-related MAC permissions */ # define CAP_FS_MASK_B0 (CAP_TO_MASK(CAP_CHOWN) \ | CAP_TO_MASK(CAP_MKNOD) \ | CAP_TO_MASK(CAP_DAC_OVERRIDE) \ | CAP_TO_MASK(CAP_DAC_READ_SEARCH) \ | CAP_TO_MASK(CAP_FOWNER) \ | CAP_TO_MASK(CAP_FSETID)) # define CAP_FS_MASK_B1 (CAP_TO_MASK(CAP_MAC_OVERRIDE)) #if _KERNEL_CAPABILITY_U32S != 2 # error Fix up hand-coded capability macro initializers #else /* HAND-CODED capability initializers */ #define CAP_LAST_U32 ((_KERNEL_CAPABILITY_U32S) - 1) #define CAP_LAST_U32_VALID_MASK (CAP_TO_MASK(CAP_LAST_CAP + 1) -1) # define CAP_EMPTY_SET ((kernel_cap_t){{ 0, 0 }}) # define CAP_FULL_SET ((kernel_cap_t){{ ~0, CAP_LAST_U32_VALID_MASK }}) # define CAP_FS_SET ((kernel_cap_t){{ CAP_FS_MASK_B0 \ | CAP_TO_MASK(CAP_LINUX_IMMUTABLE), \ CAP_FS_MASK_B1 } }) # define CAP_NFSD_SET ((kernel_cap_t){{ CAP_FS_MASK_B0 \ | CAP_TO_MASK(CAP_SYS_RESOURCE), \ CAP_FS_MASK_B1 } }) #endif /* _KERNEL_CAPABILITY_U32S != 2 */ # define cap_clear(c) do { (c) = __cap_empty_set; } while (0) #define cap_raise(c, flag) ((c).cap[CAP_TO_INDEX(flag)] |= CAP_TO_MASK(flag)) #define cap_lower(c, flag) ((c).cap[CAP_TO_INDEX(flag)] &= ~CAP_TO_MASK(flag)) #define cap_raised(c, flag) ((c).cap[CAP_TO_INDEX(flag)] & CAP_TO_MASK(flag)) #define CAP_BOP_ALL(c, a, b, OP) \ do { \ unsigned __capi; \ CAP_FOR_EACH_U32(__capi) { \ c.cap[__capi] = a.cap[__capi] OP b.cap[__capi]; \ } \ } while (0) #define CAP_UOP_ALL(c, a, OP) \ do { \ unsigned __capi; \ CAP_FOR_EACH_U32(__capi) { \ c.cap[__capi] = OP a.cap[__capi]; \ } \ } while (0) static inline kernel_cap_t cap_combine(const kernel_cap_t a, const kernel_cap_t b) { kernel_cap_t dest; CAP_BOP_ALL(dest, a, b, |); return dest; } static inline kernel_cap_t cap_intersect(const kernel_cap_t a, const kernel_cap_t b) { kernel_cap_t dest; CAP_BOP_ALL(dest, a, b, &); return dest; } static inline kernel_cap_t cap_drop(const kernel_cap_t a, const kernel_cap_t drop) { kernel_cap_t dest; CAP_BOP_ALL(dest, a, drop, &~); return dest; } static inline kernel_cap_t cap_invert(const kernel_cap_t c) { kernel_cap_t dest; CAP_UOP_ALL(dest, c, ~); return dest; } static inline bool cap_isclear(const kernel_cap_t a) { unsigned __capi; CAP_FOR_EACH_U32(__capi) { if (a.cap[__capi] != 0) return false; } return true; } /* * Check if "a" is a subset of "set". * return true if ALL of the capabilities in "a" are also in "set" * cap_issubset(0101, 1111) will return true * return false if ANY of the capabilities in "a" are not in "set" * cap_issubset(1111, 0101) will return false */ static inline bool cap_issubset(const kernel_cap_t a, const kernel_cap_t set) { kernel_cap_t dest; dest = cap_drop(a, set); return cap_isclear(dest); } /* Used to decide between falling back on the old suser() or fsuser(). */ static inline kernel_cap_t cap_drop_fs_set(const kernel_cap_t a) { const kernel_cap_t __cap_fs_set = CAP_FS_SET; return cap_drop(a, __cap_fs_set); } static inline kernel_cap_t cap_raise_fs_set(const kernel_cap_t a, const kernel_cap_t permitted) { const kernel_cap_t __cap_fs_set = CAP_FS_SET; return cap_combine(a, cap_intersect(permitted, __cap_fs_set)); } static inline kernel_cap_t cap_drop_nfsd_set(const kernel_cap_t a) { const kernel_cap_t __cap_fs_set = CAP_NFSD_SET; return cap_drop(a, __cap_fs_set); } static inline kernel_cap_t cap_raise_nfsd_set(const kernel_cap_t a, const kernel_cap_t permitted) { const kernel_cap_t __cap_nfsd_set = CAP_NFSD_SET; return cap_combine(a, cap_intersect(permitted, __cap_nfsd_set)); } #ifdef CONFIG_MULTIUSER extern bool has_capability(struct task_struct *t, int cap); extern bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap); extern bool has_capability_noaudit(struct task_struct *t, int cap); extern bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap); extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); extern bool ns_capable_noaudit(struct user_namespace *ns, int cap); extern bool ns_capable_setid(struct user_namespace *ns, int cap); #else static inline bool has_capability(struct task_struct *t, int cap) { return true; } static inline bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap) { return true; } static inline bool has_capability_noaudit(struct task_struct *t, int cap) { return true; } static inline bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap) { return true; } static inline bool capable(int cap) { return true; } static inline bool ns_capable(struct user_namespace *ns, int cap) { return true; } static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap) { return true; } static inline bool ns_capable_setid(struct user_namespace *ns, int cap) { return true; } #endif /* CONFIG_MULTIUSER */ extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode); extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); static inline bool perfmon_capable(void) { return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN); } static inline bool bpf_capable(void) { return capable(CAP_BPF) || capable(CAP_SYS_ADMIN); } static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns) { return ns_capable(ns, CAP_CHECKPOINT_RESTORE) || ns_capable(ns, CAP_SYS_ADMIN); } /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size); #endif /* !_LINUX_CAPABILITY_H */
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 // SPDX-License-Identifier: GPL-2.0 // Generated by scripts/atomic/gen-atomic-fallback.sh // DO NOT MODIFY THIS FILE DIRECTLY #ifndef _LINUX_ATOMIC_FALLBACK_H #define _LINUX_ATOMIC_FALLBACK_H #include <linux/compiler.h> #ifndef arch_xchg_relaxed #define arch_xchg_relaxed arch_xchg #define arch_xchg_acquire arch_xchg #define arch_xchg_release arch_xchg #else /* arch_xchg_relaxed */ #ifndef arch_xchg_acquire #define arch_xchg_acquire(...) \ __atomic_op_acquire(arch_xchg, __VA_ARGS__) #endif #ifndef arch_xchg_release #define arch_xchg_release(...) \ __atomic_op_release(arch_xchg, __VA_ARGS__) #endif #ifndef arch_xchg #define arch_xchg(...) \ __atomic_op_fence(arch_xchg, __VA_ARGS__) #endif #endif /* arch_xchg_relaxed */ #ifndef arch_cmpxchg_relaxed #define arch_cmpxchg_relaxed arch_cmpxchg #define arch_cmpxchg_acquire arch_cmpxchg #define arch_cmpxchg_release arch_cmpxchg #else /* arch_cmpxchg_relaxed */ #ifndef arch_cmpxchg_acquire #define arch_cmpxchg_acquire(...) \ __atomic_op_acquire(arch_cmpxchg, __VA_ARGS__) #endif #ifndef arch_cmpxchg_release #define arch_cmpxchg_release(...) \ __atomic_op_release(arch_cmpxchg, __VA_ARGS__) #endif #ifndef arch_cmpxchg #define arch_cmpxchg(...) \ __atomic_op_fence(arch_cmpxchg, __VA_ARGS__) #endif #endif /* arch_cmpxchg_relaxed */ #ifndef arch_cmpxchg64_relaxed #define arch_cmpxchg64_relaxed arch_cmpxchg64 #define arch_cmpxchg64_acquire arch_cmpxchg64 #define arch_cmpxchg64_release arch_cmpxchg64 #else /* arch_cmpxchg64_relaxed */ #ifndef arch_cmpxchg64_acquire #define arch_cmpxchg64_acquire(...) \ __atomic_op_acquire(arch_cmpxchg64, __VA_ARGS__) #endif #ifndef arch_cmpxchg64_release #define arch_cmpxchg64_release(...) \ __atomic_op_release(arch_cmpxchg64, __VA_ARGS__) #endif #ifndef arch_cmpxchg64 #define arch_cmpxchg64(...) \ __atomic_op_fence(arch_cmpxchg64, __VA_ARGS__) #endif #endif /* arch_cmpxchg64_relaxed */ #ifndef arch_atomic_read_acquire static __always_inline int arch_atomic_read_acquire(const atomic_t *v) { return smp_load_acquire(&(v)->counter); } #define arch_atomic_read_acquire arch_atomic_read_acquire #endif #ifndef arch_atomic_set_release static __always_inline void arch_atomic_set_release(atomic_t *v, int i) { smp_store_release(&(v)->counter, i); } #define arch_atomic_set_release arch_atomic_set_release #endif #ifndef arch_atomic_add_return_relaxed #define arch_atomic_add_return_acquire arch_atomic_add_return #define arch_atomic_add_return_release arch_atomic_add_return #define arch_atomic_add_return_relaxed arch_atomic_add_return #else /* arch_atomic_add_return_relaxed */ #ifndef arch_atomic_add_return_acquire static __always_inline int arch_atomic_add_return_acquire(int i, atomic_t *v) { int ret = arch_atomic_add_return_relaxed(i, v); __atomic_acquire_fence(); return ret; } #define arch_atomic_add_return_acquire arch_atomic_add_return_acquire #endif #ifndef arch_atomic_add_return_release static __always_inline int arch_atomic_add_return_release(int i, atomic_t *v) { __atomic_release_fence(); return arch_atomic_add_return_relaxed(i, v); } #define arch_atomic_add_return_release arch_atomic_add_return_release #endif #ifndef arch_atomic_add_return static __always_inline int arch_atomic_add_return(int i, atomic_t *v) { int ret; __atomic_pre_full_fence(); ret = arch_atomic_add_return_relaxed(i, v); __atomic_post_full_fence(); return ret; } #define arch_atomic_add_return arch_atomic_add_return #endif #endif /* arch_atomic_add_return_relaxed */ #ifndef arch_atomic_fetch_add_relaxed #define arch_atomic_fetch_add_acquire arch_atomic_fetch_add #define arch_atomic_fetch_add_release arch_atomic_fetch_add #define arch_atomic_fetch_add_relaxed arch_atomic_fetch_add #else /* arch_atomic_fetch_add_relaxed */ #ifndef arch_atomic_fetch_add_acquire static __always_inline int arch_atomic_fetch_add_acquire(int i, atomic_t *v) { int ret = arch_atomic_fetch_add_relaxed(i, v); __atomic_acquire_fence(); return ret; } #define arch_atomic_fetch_add_acquire arch_atomic_fetch_add_acquire #endif #ifndef arch_atomic_fetch_add_release static __always_inline int arch_atomic_fetch_add_release(int i, atomic_t *v) { __atomic_release_fence(); return arch_atomic_fetch_add_relaxed(i, v); } #define arch_atomic_fetch_add_release arch_atomic_fetch_add_release #endif #ifndef arch_atomic_fetch_add static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v) { int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_add_relaxed(i, v); __atomic_post_full_fence(); return ret; } #define arch_atomic_fetch_add arch_atomic_fetch_add #endif #endif /* arch_atomic_fetch_add_relaxed */ #ifndef arch_atomic_sub_return_relaxed #define arch_atomic_sub_return_acquire arch_atomic_sub_return #define arch_atomic_sub_return_release arch_atomic_sub_return #define arch_atomic_sub_return_relaxed arch_atomic_sub_return #else /* arch_atomic_sub_return_relaxed */ #ifndef arch_atomic_sub_return_acquire static __always_inline int arch_atomic_sub_return_acquire(int i, atomic_t *v) { int ret = arch_atomic_sub_return_relaxed(i, v); __atomic_acquire_fence(); return ret; } #define arch_atomic_sub_return_acquire arch_atomic_sub_return_acquire #endif #ifndef arch_atomic_sub_return_release static __always_inline int arch_atomic_sub_return_release(int i, atomic_t *v) { __atomic_release_fence(); return arch_atomic_sub_return_relaxed(i, v); } #define arch_atomic_sub_return_release arch_atomic_sub_return_release #endif #ifndef arch_atomic_sub_return static __always_inline int arch_atomic_sub_return(int i, atomic_t *v) { int ret; __atomic_pre_full_fence(); ret = arch_atomic_sub_return_relaxed(i, v); __atomic_post_full_fence(); return ret; } #define arch_atomic_sub_return arch_atomic_sub_return #endif #endif /* arch_atomic_sub_return_relaxed */ #ifndef arch_atomic_fetch_sub_relaxed #define arch_atomic_fetch_sub_acquire arch_atomic_fetch_sub #define arch_atomic_fetch_sub_release arch_atomic_fetch_sub #define arch_atomic_fetch_sub_relaxed arch_atomic_fetch_sub #else /* arch_atomic_fetch_sub_relaxed */ #ifndef arch_atomic_fetch_sub_acquire static __always_inline int arch_atomic_fetch_sub_acquire(int i, atomic_t *v) { int ret = arch_atomic_fetch_sub_relaxed(i, v); __atomic_acquire_fence(); return ret; } #define arch_atomic_fetch_sub_acquire arch_atomic_fetch_sub_acquire #endif #ifndef arch_atomic_fetch_sub_release static __always_inline int arch_atomic_fetch_sub_release(int i, atomic_t *v) { __atomic_release_fence(); return arch_atomic_fetch_sub_relaxed(i, v); } #define arch_atomic_fetch_sub_release arch_atomic_fetch_sub_release #endif #ifndef arch_atomic_fetch_sub static __always_inline int arch_atomic_fetch_sub(int i, atomic_t *v) { int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_sub_relaxed(i, v); __atomic_post_full_fence(); return ret; } #define arch_atomic_fetch_sub arch_atomic_fetch_sub #endif #endif /* arch_atomic_fetch_sub_relaxed */ #ifndef arch_atomic_inc static __always_inline void arch_atomic_inc(atomic_t *v) { arch_atomic_add(1, v); } #define arch_atomic_inc arch_atomic_inc #endif #ifndef arch_atomic_inc_return_relaxed #ifdef arch_atomic_inc_return #define arch_atomic_inc_return_acquire arch_atomic_inc_return #define arch_atomic_inc_return_release arch_atomic_inc_return #define arch_atomic_inc_return_relaxed arch_atomic_inc_return #endif /* arch_atomic_inc_return */ #ifndef arch_atomic_inc_return static __always_inline int arch_atomic_inc_return(atomic_t *v) { return arch_atomic_add_return(1, v); } #define arch_atomic_inc_return arch_atomic_inc_return #endif #ifndef arch_atomic_inc_return_acquire static __always_inline int arch_atomic_inc_return_acquire(atomic_t *v) { return arch_atomic_add_return_acquire(1, v); } #define arch_atomic_inc_return_acquire arch_atomic_inc_return_acquire #endif #ifndef arch_atomic_inc_return_release static __always_inline int arch_atomic_inc_return_release(atomic_t *v) { return arch_atomic_add_return_release(1, v); } #define arch_atomic_inc_return_release arch_atomic_inc_return_release #endif #ifndef arch_atomic_inc_return_relaxed static __always_inline int arch_atomic_inc_return_relaxed(atomic_t *v) { return arch_atomic_add_return_relaxed(1, v); } #define arch_atomic_inc_return_relaxed arch_atomic_inc_return_relaxed #endif #else /* arch_atomic_inc_return_relaxed */ #ifndef arch_atomic_inc_return_acquire static __always_inline int arch_atomic_inc_return_acquire(atomic_t *v) { int ret = arch_atomic_inc_return_relaxed(v); __atomic_acquire_fence(); return ret; } #define arch_atomic_inc_return_acquire arch_atomic_inc_return_acquire #endif #ifndef arch_atomic_inc_return_release static __always_inline int arch_atomic_inc_return_release(atomic_t *v) { __atomic_release_fence(); return arch_atomic_inc_return_relaxed(v); } #define arch_atomic_inc_return_release arch_atomic_inc_return_release #endif #ifndef arch_atomic_inc_return static __always_inline int arch_atomic_inc_return(atomic_t *v) { int ret; __atomic_pre_full_fence(); ret = arch_atomic_inc_return_relaxed(v); __atomic_post_full_fence(); return ret; } #define arch_atomic_inc_return arch_atomic_inc_return #endif #endif /* arch_atomic_inc_return_relaxed */ #ifndef arch_atomic_fetch_inc_relaxed #ifdef arch_atomic_fetch_inc #define arch_atomic_fetch_inc_acquire arch_atomic_fetch_inc #define arch_atomic_fetch_inc_release arch_atomic_fetch_inc #define arch_atomic_fetch_inc_relaxed arch_atomic_fetch_inc #endif /* arch_atomic_fetch_inc */ #ifndef arch_atomic_fetch_inc static __always_inline int arch_atomic_fetch_inc(atomic_t *v) { return arch_atomic_fetch_add(1, v); } #define arch_atomic_fetch_inc arch_atomic_fetch_inc #endif #ifndef arch_atomic_fetch_inc_acquire static __always_inline int arch_atomic_fetch_inc_acquire(atomic_t *v) { return arch_atomic_fetch_add_acquire(1, v); } #define arch_atomic_fetch_inc_acquire arch_atomic_fetch_inc_acquire #endif #ifndef arch_atomic_fetch_inc_release static __always_inline int arch_atomic_fetch_inc_release(atomic_t *v) { return arch_atomic_fetch_add_release(1, v); } #define arch_atomic_fetch_inc_release arch_atomic_fetch_inc_release #endif #ifndef arch_atomic_fetch_inc_relaxed static __always_inline int arch_atomic_fetch_inc_relaxed(atomic_t *v) { return arch_atomic_fetch_add_relaxed(1, v); } #define arch_atomic_fetch_inc_relaxed arch_atomic_fetch_inc_relaxed #endif #else /* arch_atomic_fetch_inc_relaxed */ #ifndef arch_atomic_fetch_inc_acquire static __always_inline int arch_atomic_fetch_inc_acquire(atomic_t *v) { int ret = arch_atomic_fetch_inc_relaxed(v); __atomic_acquire_fence(); return ret; } #define arch_atomic_fetch_inc_acquire arch_atomic_fetch_inc_acquire #endif #ifndef arch_atomic_fetch_inc_release static __always_inline int arch_atomic_fetch_inc_release(atomic_t *v) { __atomic_release_fence(); return arch_atomic_fetch_inc_relaxed(v); } #define arch_atomic_fetch_inc_release arch_atomic_fetch_inc_release #endif #ifndef arch_atomic_fetch_inc static __always_inline int arch_atomic_fetch_inc(atomic_t *v) { int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_inc_relaxed(v); __atomic_post_full_fence(); return ret; } #define arch_atomic_fetch_inc arch_atomic_fetch_inc #endif #endif /* arch_atomic_fetch_inc_relaxed */ #ifndef arch_atomic_dec static __always_inline void arch_atomic_dec(atomic_t *v) { arch_atomic_sub(1, v); } #define arch_atomic_dec arch_atomic_dec #endif #ifndef arch_atomic_dec_return_relaxed #ifdef arch_atomic_dec_return #define arch_atomic_dec_return_acquire arch_atomic_dec_return #define arch_atomic_dec_return_release arch_atomic_dec_return #define arch_atomic_dec_return_relaxed arch_atomic_dec_return #endif /* arch_atomic_dec_return */ #ifndef arch_atomic_dec_return static __always_inline int arch_atomic_dec_return(atomic_t *v) { return arch_atomic_sub_return(1, v); } #define arch_atomic_dec_return arch_atomic_dec_return #endif #ifndef arch_atomic_dec_return_acquire static __always_inline int arch_atomic_dec_return_acquire(atomic_t *v) { return arch_atomic_sub_return_acquire(1, v); } #define arch_atomic_dec_return_acquire arch_atomic_dec_return_acquire #endif #ifndef arch_atomic_dec_return_release static __always_inline int arch_atomic_dec_return_release(atomic_t *v) { return arch_atomic_sub_return_release(1, v); } #define arch_atomic_dec_return_release arch_atomic_dec_return_release #endif #ifndef arch_atomic_dec_return_relaxed static __always_inline int arch_atomic_dec_return_relaxed(atomic_t *v) { return arch_atomic_sub_return_relaxed(1, v); } #define arch_atomic_dec_return_relaxed arch_atomic_dec_return_relaxed #endif #else /* arch_atomic_dec_return_relaxed */ #ifndef arch_atomic_dec_return_acquire static __always_inline int arch_atomic_dec_return_acquire(atomic_t *v) { int ret = arch_atomic_dec_return_relaxed(v); __atomic_acquire_fence(); return ret; } #define arch_atomic_dec_return_acquire arch_atomic_dec_return_acquire #endif #ifndef arch_atomic_dec_return_release static __always_inline int arch_atomic_dec_return_release(atomic_t *v) { __atomic_release_fence(); return arch_atomic_dec_return_relaxed(v); } #define arch_atomic_dec_return_release arch_atomic_dec_return_release #endif #ifndef arch_atomic_dec_return static __always_inline int arch_atomic_dec_return(atomic_t *v) { int ret; __atomic_pre_full_fence(); ret = arch_atomic_dec_return_relaxed(v); __atomic_post_full_fence(); return ret; } #define arch_atomic_dec_return arch_atomic_dec_return #endif #endif /* arch_atomic_dec_return_relaxed */ #ifndef arch_atomic_fetch_dec_relaxed #ifdef arch_atomic_fetch_dec #define arch_atomic_fetch_dec_acquire arch_atomic_fetch_dec #define arch_atomic_fetch_dec_release arch_atomic_fetch_dec #define arch_atomic_fetch_dec_relaxed arch_atomic_fetch_dec #endif /* arch_atomic_fetch_dec */ #ifndef arch_atomic_fetch_dec static __always_inline int arch_atomic_fetch_dec(atomic_t *v) { return arch_atomic_fetch_sub(1, v); } #define arch_atomic_fetch_dec arch_atomic_fetch_dec #endif #ifndef arch_atomic_fetch_dec_acquire static __always_inline int arch_atomic_fetch_dec_acquire(atomic_t *v) { return arch_atomic_fetch_sub_acquire(1, v); } #define arch_atomic_fetch_dec_acquire arch_atomic_fetch_dec_acquire #endif #ifndef arch_atomic_fetch_dec_release static __always_inline int arch_atomic_fetch_dec_release(atomic_t *v) { return arch_atomic_fetch_sub_release(1, v); } #define arch_atomic_fetch_dec_release arch_atomic_fetch_dec_release #endif #ifndef arch_atomic_fetch_dec_relaxed static __always_inline int arch_atomic_fetch_dec_relaxed(atomic_t *v) { return arch_atomic_fetch_sub_relaxed(1, v); } #define arch_atomic_fetch_dec_relaxed arch_atomic_fetch_dec_relaxed #endif #else /* arch_atomic_fetch_dec_relaxed */ #ifndef arch_atomic_fetch_dec_acquire static __always_inline int arch_atomic_fetch_dec_acquire(atomic_t *v) { int ret = arch_atomic_fetch_dec_relaxed(v); __atomic_acquire_fence(); return ret; } #define arch_atomic_fetch_dec_acquire arch_atomic_fetch_dec_acquire #endif #ifndef arch_atomic_fetch_dec_release static __always_inline int arch_atomic_fetch_dec_release(atomic_t *v) { __atomic_release_fence(); return arch_atomic_fetch_dec_relaxed(v); } #define arch_atomic_fetch_dec_release arch_atomic_fetch_dec_release #endif #ifndef arch_atomic_fetch_dec static __always_inline int arch_atomic_fetch_dec(atomic_t *v) { int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_dec_relaxed(v); __atomic_post_full_fence(); return ret; } #define arch_atomic_fetch_dec arch_atomic_fetch_dec #endif #endif /* arch_atomic_fetch_dec_relaxed */ #ifndef arch_atomic_fetch_and_relaxed #define arch_atomic_fetch_and_acquire arch_atomic_fetch_and #define arch_atomic_fetch_and_release arch_atomic_fetch_and #define arch_atomic_fetch_and_relaxed arch_atomic_fetch_and #else /* arch_atomic_fetch_and_relaxed */ #ifndef arch_atomic_fetch_and_acquire static __always_inline int arch_atomic_fetch_and_acquire(int i, atomic_t *v) { int ret = arch_atomic_fetch_and_relaxed(i, v); __atomic_acquire_fence(); return ret; } #define arch_atomic_fetch_and_acquire arch_atomic_fetch_and_acquire #endif #ifndef arch_atomic_fetch_and_release static __always_inline int arch_atomic_fetch_and_release(int i, atomic_t *v) { __atomic_release_fence(); return arch_atomic_fetch_and_relaxed(i, v); } #define arch_atomic_fetch_and_release arch_atomic_fetch_and_release #endif #ifndef arch_atomic_fetch_and static __always_inline int arch_atomic_fetch_and(int i, atomic_t *v) { int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_and_relaxed(i, v); __atomic_post_full_fence(); return ret; } #define arch_atomic_fetch_and arch_atomic_fetch_and #endif #endif /* arch_atomic_fetch_and_relaxed */ #ifndef arch_atomic_andnot static __always_inline void arch_atomic_andnot(int i, atomic_t *v) { arch_atomic_and(~i, v); } #define arch_atomic_andnot arch_atomic_andnot #endif #ifndef arch_atomic_fetch_andnot_relaxed #ifdef arch_atomic_fetch_andnot #define arch_atomic_fetch_andnot_acquire arch_atomic_fetch_andnot #define arch_atomic_fetch_andnot_release arch_atomic_fetch_andnot #define arch_atomic_fetch_andnot_relaxed arch_atomic_fetch_andnot #endif /* arch_atomic_fetch_andnot */ #ifndef arch_atomic_fetch_andnot static __always_inline int arch_atomic_fetch_andnot(int i, atomic_t *v) { return arch_atomic_fetch_and(~i, v); } #define arch_atomic_fetch_andnot arch_atomic_fetch_andnot #endif #ifndef arch_atomic_fetch_andnot_acquire static __always_inline int arch_atomic_fetch_andnot_acquire(int i, atomic_t *v) { return arch_atomic_fetch_and_acquire(~i, v); } #define arch_atomic_fetch_andnot_acquire arch_atomic_fetch_andnot_acquire #endif #ifndef arch_atomic_fetch_andnot_release static __always_inline int arch_atomic_fetch_andnot_release(int i, atomic_t *v) { return arch_atomic_fetch_and_release(~i, v); } #define arch_atomic_fetch_andnot_release arch_atomic_fetch_andnot_release #endif #ifndef arch_atomic_fetch_andnot_relaxed static __always_inline int arch_atomic_fetch_andnot_relaxed(int i, atomic_t *v) { return arch_atomic_fetch_and_relaxed(~i, v); } #define arch_atomic_fetch_andnot_relaxed arch_atomic_fetch_andnot_relaxed #endif #else /* arch_atomic_fetch_andnot_relaxed */ #ifndef arch_atomic_fetch_andnot_acquire static __always_inline int arch_atomic_fetch_andnot_acquire(int i, atomic_t *v) { int ret = arch_atomic_fetch_andnot_relaxed(i, v); __atomic_acquire_fence(); return ret; } #define arch_atomic_fetch_andnot_acquire arch_atomic_fetch_andnot_acquire #endif #ifndef arch_atomic_fetch_andnot_release static __always_inline int arch_atomic_fetch_andnot_release(int i, atomic_t *v) { __atomic_release_fence(); return arch_atomic_fetch_andnot_relaxed(i, v); } #define arch_atomic_fetch_andnot_release arch_atomic_fetch_andnot_release #endif #ifndef arch_atomic_fetch_andnot static __always_inline int arch_atomic_fetch_andnot(int i, atomic_t *v) { int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_andnot_relaxed(i, v); __atomic_post_full_fence(); return ret; } #define arch_atomic_fetch_andnot arch_atomic_fetch_andnot #endif #endif /* arch_atomic_fetch_andnot_relaxed */ #ifndef arch_atomic_fetch_or_relaxed #define arch_atomic_fetch_or_acquire arch_atomic_fetch_or #define arch_atomic_fetch_or_release arch_atomic_fetch_or #define arch_atomic_fetch_or_relaxed arch_atomic_fetch_or #else /* arch_atomic_fetch_or_relaxed */ #ifndef arch_atomic_fetch_or_acquire static __always_inline int arch_atomic_fetch_or_acquire(int i, atomic_t *v) { int ret = arch_atomic_fetch_or_relaxed(i, v); __atomic_acquire_fence(); return ret; } #define arch_atomic_fetch_or_acquire arch_atomic_fetch_or_acquire #endif #ifndef arch_atomic_fetch_or_release static __always_inline int arch_atomic_fetch_or_release(int i, atomic_t *v) { __atomic_release_fence(); return arch_atomic_fetch_or_relaxed(i, v); } #define arch_atomic_fetch_or_release arch_atomic_fetch_or_release #endif #ifndef arch_atomic_fetch_or static __always_inline int arch_atomic_fetch_or(int i, atomic_t *v) { int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_or_relaxed(i, v); __atomic_post_full_fence(); return ret; } #define arch_atomic_fetch_or arch_atomic_fetch_or #endif #endif /* arch_atomic_fetch_or_relaxed */ #ifndef arch_atomic_fetch_xor_relaxed #define arch_atomic_fetch_xor_acquire arch_atomic_fetch_xor #define arch_atomic_fetch_xor_release arch_atomic_fetch_xor #define arch_atomic_fetch_xor_relaxed arch_atomic_fetch_xor #else /* arch_atomic_fetch_xor_relaxed */ #ifndef arch_atomic_fetch_xor_acquire static __always_inline int arch_atomic_fetch_xor_acquire(int i, atomic_t *v) { int ret = arch_atomic_fetch_xor_relaxed(i, v); __atomic_acquire_fence(); return ret; } #define arch_atomic_fetch_xor_acquire arch_atomic_fetch_xor_acquire #endif #ifndef arch_atomic_fetch_xor_release static __always_inline int arch_atomic_fetch_xor_release(int i, atomic_t *v) { __atomic_release_fence(); return arch_atomic_fetch_xor_relaxed(i, v); } #define arch_atomic_fetch_xor_release arch_atomic_fetch_xor_release #endif #ifndef arch_atomic_fetch_xor static __always_inline int arch_atomic_fetch_xor(int i, atomic_t *v) { int ret; __atomic_pre_full_fence(); ret = arch_atomic_fetch_xor_relaxed(i, v); __atomic_post_full_fence(); return ret; } #define arch_atomic_fetch_xor arch_atomic_fetch_xor #endif #endif /* arch_atomic_fetch_xor_relaxed */ #ifndef arch_atomic_xchg_relaxed #define arch_atomic_xchg_acquire arch_atomic_xchg #define arch_atomic_xchg_release arch_atomic_xchg #define arch_atomic_xchg_relaxed arch_atomic_xchg #else /* arch_atomic_xchg_relaxed */ #ifndef arch_atomic_xchg_acquire static __always_inline int arch_atomic_xchg_acquire(atomic_t *v, int i) { int ret = arch_atomic_xchg_relaxed(v, i); __atomic_acquire_fence(); return ret; } #define arch_atomic_xchg_acquire arch_atomic_xchg_acquire #endif #ifndef arch_atomic_xchg_release static __always_inline int arch_atomic_xchg_release(atomic_t *v, int i) { __atomic_release_fence(); return arch_atomic_xchg_relaxed(v, i); } #define arch_atomic_xchg_release arch_atomic_xchg_release #endif #ifndef arch_atomic_xchg static __always_inline int arch_atomic_xchg(atomic_t *v, int i) { int ret; __atomic_pre_full_fence(); ret = arch_atomic_xchg_relaxed(v, i); __atomic_post_full_fence(); return ret; } #define arch_atomic_xchg arch_atomic_xchg #endif #endif /* arch_atomic_xchg_relaxed */ #ifndef arch_atomic_cmpxchg_relaxed #define arch_atomic_cmpxchg_acquire arch_atomic_cmpxchg #define arch_atomic_cmpxchg_release arch_atomic_cmpxchg #define arch_atomic_cmpxchg_relaxed arch_atomic_cmpxchg #else /* arch_atomic_cmpxchg_relaxed */ #ifndef arch_atomic_cmpxchg_acquire static __always_inline int arch_atomic_cmpxchg_acquire(atomic_t *v, int old, int new) { int ret = arch_atomic_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; } #define arch_atomic_cmpxchg_acquire arch_atomic_cmpxchg_acquire #endif #ifndef arch_atomic_cmpxchg_release static __always_inline int arch_atomic_cmpxchg_release(atomic_t *v, int old, int new) { __atomic_release_fence(); return arch_atomic_cmpxchg_relaxed(v, old, new); } #define arch_atomic_cmpxchg_release arch_atomic_cmpxchg_release #endif #ifndef arch_atomic_cmpxchg static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) { int ret; __atomic_pre_full_fence(); ret = arch_atomic_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; } #define arch_atomic_cmpxchg arch_atomic_cmpxchg #endif #endif /* arch_atomic_cmpxchg_relaxed */ #ifndef arch_atomic_try_cmpxchg_relaxed #ifdef arch_atomic_try_cmpxchg #define arch_atomic_try_cmpxchg_acquire arch_atomic_try_cmpxchg #define arch_atomic_try_cmpxchg_release arch_atomic_try_cmpxchg #define arch_atomic_try_cmpxchg_relaxed arch_atomic_try_cmpxchg #endif /* arch_atomic_try_cmpxchg */ #ifndef arch_atomic_try_cmpxchg static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new) { int r, o = *old; r = arch_atomic_cmpxchg(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); } #define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg #endif #ifndef arch_atomic_try_cmpxchg_acquire static __always_inline bool arch_atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) { int r, o = *old; r = arch_atomic_cmpxchg_acquire(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); } #define arch_atomic_try_cmpxchg_acquire arch_atomic_try_cmpxchg_acquire #endif #ifndef arch_atomic_try_cmpxchg_release static __always_inline bool arch_atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) { int r, o = *old; r = arch_atomic_cmpxchg_release(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); } #define arch_atomic_try_cmpxchg_release arch_atomic_try_cmpxchg_release #endif #ifndef arch_atomic_try_cmpxchg_relaxed static __always_inline bool arch_atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) { int r, o = *old; r = arch_atomic_cmpxchg_relaxed(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); } #define arch_atomic_try_cmpxchg_relaxed arch_atomic_try_cmpxchg_relaxed #endif #else /* arch_atomic_try_cmpxchg_relaxed */ #ifndef arch_atomic_try_cmpxchg_acquire static __always_inline bool arch_atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) { bool ret = arch_atomic_try_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; } #define arch_atomic_try_cmpxchg_acquire arch_atomic_try_cmpxchg_acquire #endif #ifndef arch_atomic_try_cmpxchg_release static __always_inline bool arch_atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) { __atomic_release_fence(); return arch_atomic_try_cmpxchg_relaxed(v, old, new); } #define arch_atomic_try_cmpxchg_release arch_atomic_try_cmpxchg_release #endif #ifndef arch_atomic_try_cmpxchg static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new) { bool ret; __atomic_pre_full_fence(); ret = arch_atomic_try_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; } #define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg #endif #endif /* arch_atomic_try_cmpxchg_relaxed */ #ifndef arch_atomic_sub_and_test /** * arch_atomic_sub_and_test - subtract value from variable and test result * @i: integer value to subtract * @v: pointer of type atomic_t * * Atomically subtracts @i from @v and returns * true if the result is zero, or false for all * other cases. */ static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) { return arch_atomic_sub_return(i, v) == 0; } #define arch_atomic_sub_and_test arch_atomic_sub_and_test #endif #ifndef arch_atomic_dec_and_test /** * arch_atomic_dec_and_test - decrement and test * @v: pointer of type atomic_t * * Atomically decrements @v by 1 and * returns true if the result is 0, or false for all other * cases. */ static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) { return arch_atomic_dec_return(v) == 0; } #define arch_atomic_dec_and_test arch_atomic_dec_and_test #endif #ifndef arch_atomic_inc_and_test /** * arch_atomic_inc_and_test - increment and test * @v: pointer of type atomic_t * * Atomically increments @v by 1 * and returns true if the result is zero, or false for all * other cases. */ static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) { return arch_atomic_inc_return(v) == 0; } #define arch_atomic_inc_and_test arch_atomic_inc_and_test #endif #ifndef arch_atomic_add_negative /** * arch_atomic_add_negative - add and test if negative * @i: integer value to add * @v: pointer of type atomic_t * * Atomically adds @i to @v and returns true * if the result is negative, or false when * result is greater than or equal to zero. */ static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v) { return arch_atomic_add_return(i, v) < 0; } #define arch_atomic_add_negative arch_atomic_add_negative #endif #ifndef arch_atomic_fetch_add_unless /** * arch_atomic_fetch_add_unless - add unless the number is already a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. * * Atomically adds @a to @v, so long as @v was not already @u. * Returns original value of @v */ static __always_inline int arch_atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c = arch_atomic_read(v); do { if (unlikely(c == u)) break; } while (!arch_atomic_try_cmpxchg(v, &c, c + a)); return c; } #define arch_atomic_fetch_add_unless arch_atomic_fetch_add_unless #endif #ifndef arch_atomic_add_unless /** * arch_atomic_add_unless - add unless the number is already a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. * * Atomically adds @a to @v, if @v was not already @u. * Returns true if the addition was done. */ static __always_inline bool arch_atomic_add_unless(atomic_t *v, int a, int u) { return arch_atomic_fetch_add_unless(v, a, u) != u; } #define arch_atomic_add_unless arch_atomic_add_unless #endif #ifndef arch_atomic_inc_not_zero /** * arch_atomic_inc_not_zero - increment unless the number is zero * @v: pointer of type atomic_t * * Atomically increments @v by 1, if @v is non-zero. * Returns true if the increment was done. */ static __always_inline bool arch_atomic_inc_not_zero(atomic_t *v) { return arch_atomic_add_unless(v, 1, 0); } #define arch_atomic_inc_not_zero arch_atomic_inc_not_zero #endif #ifndef arch_atomic_inc_unless_negative static __always_inline bool arch_atomic_inc_unless_negative(atomic_t *v) { int c = arch_atomic_read(v); do { if (unlikely(c < 0)) return false; } while (!arch_atomic_try_cmpxchg(v, &c, c + 1)); return true; } #define arch_atomic_inc_unless_negative arch_atomic_inc_unless_negative #endif #ifndef arch_atomic_dec_unless_positive static __always_inline bool arch_atomic_dec_unless_positive(atomic_t *v) { int c = arch_atomic_read(v); do { if (unlikely(c > 0)) return false; } while (!arch_atomic_try_cmpxchg(v, &c, c - 1)); return true; } #define arch_atomic_dec_unless_positive arch_atomic_dec_unless_positive #endif #ifndef arch_atomic_dec_if_positive static __always_inline int arch_atomic_dec_if_positive(atomic_t *v) { int dec, c = arch_atomic_read(v); do { dec = c - 1; if (unlikely(dec < 0)) break; } while (!arch_atomic_try_cmpxchg(v, &c, dec)); return dec; } #define arch_atomic_dec_if_positive arch_atomic_dec_if_positive #endif #ifdef CONFIG_GENERIC_ATOMIC64 #include <asm-generic/atomic64.h> #endif #ifndef arch_atomic64_read_acquire static __always_inline s64 arch_atomic64_read_acquire(const atomic64_t *v) { return smp_load_acquire(&(v)->counter); } #define arch_atomic64_read_acquire arch_atomic64_read_acquire #endif #ifndef arch_atomic64_set_release static __always_inline void arch_atomic64_set_release(atomic64_t *v, s64 i) { smp_store_release(&(v)->counter, i); } #define arch_atomic64_set_release arch_atomic64_set_release #endif #ifndef arch_atomic64_add_return_relaxed #define arch_atomic64_add_return_acquire arch_atomic64_add_return #define arch_atomic64_add_return_release arch_atomic64_add_return #define arch_atomic64_add_return_relaxed arch_atomic64_add_return #else /* arch_atomic64_add_return_relaxed */ #ifndef arch_atomic64_add_return_acquire static __always_inline s64 arch_atomic64_add_return_acquire(s64 i, atomic64_t *v) { s64 ret = arch_atomic64_add_return_relaxed(i, v); __atomic_acquire_fence(); return ret; } #define arch_atomic64_add_return_acquire arch_atomic64_add_return_acquire #endif #ifndef arch_atomic64_add_return_release static __always_inline s64 arch_atomic64_add_return_release(s64 i, atomic64_t *v) { __atomic_release_fence(); return arch_atomic64_add_return_relaxed(i, v); } #define arch_atomic64_add_return_release arch_atomic64_add_return_release #endif #ifndef arch_atomic64_add_return static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_add_return_relaxed(i, v); __atomic_post_full_fence(); return ret; } #define arch_atomic64_add_return arch_atomic64_add_return #endif #endif /* arch_atomic64_add_return_relaxed */ #ifndef arch_atomic64_fetch_add_relaxed #define arch_atomic64_fetch_add_acquire arch_atomic64_fetch_add #define arch_atomic64_fetch_add_release arch_atomic64_fetch_add #define arch_atomic64_fetch_add_relaxed arch_atomic64_fetch_add #else /* arch_atomic64_fetch_add_relaxed */ #ifndef arch_atomic64_fetch_add_acquire static __always_inline s64 arch_atomic64_fetch_add_acquire(s64 i, atomic64_t *v) { s64 ret = arch_atomic64_fetch_add_relaxed(i, v); __atomic_acquire_fence(); return ret; } #define arch_atomic64_fetch_add_acquire arch_atomic64_fetch_add_acquire #endif #ifndef arch_atomic64_fetch_add_release static __always_inline s64 arch_atomic64_fetch_add_release(s64 i, atomic64_t *v) { __atomic_release_fence(); return arch_atomic64_fetch_add_relaxed(i, v); } #define arch_atomic64_fetch_add_release arch_atomic64_fetch_add_release #endif #ifndef arch_atomic64_fetch_add static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_add_relaxed(i, v); __atomic_post_full_fence(); return ret; } #define arch_atomic64_fetch_add arch_atomic64_fetch_add #endif #endif /* arch_atomic64_fetch_add_relaxed */ #ifndef arch_atomic64_sub_return_relaxed #define arch_atomic64_sub_return_acquire arch_atomic64_sub_return #define arch_atomic64_sub_return_release arch_atomic64_sub_return #define arch_atomic64_sub_return_relaxed arch_atomic64_sub_return #else /* arch_atomic64_sub_return_relaxed */ #ifndef arch_atomic64_sub_return_acquire static __always_inline s64 arch_atomic64_sub_return_acquire(s64 i, atomic64_t *v) { s64 ret = arch_atomic64_sub_return_relaxed(i, v); __atomic_acquire_fence(); return ret; } #define arch_atomic64_sub_return_acquire arch_atomic64_sub_return_acquire #endif #ifndef arch_atomic64_sub_return_release static __always_inline s64 arch_atomic64_sub_return_release(s64 i, atomic64_t *v) { __atomic_release_fence(); return arch_atomic64_sub_return_relaxed(i, v); } #define arch_atomic64_sub_return_release arch_atomic64_sub_return_release #endif #ifndef arch_atomic64_sub_return static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) { s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_sub_return_relaxed(i, v); __atomic_post_full_fence(); return ret; } #define arch_atomic64_sub_return arch_atomic64_sub_return #endif #endif /* arch_atomic64_sub_return_relaxed */ #ifndef arch_atomic64_fetch_sub_relaxed #define arch_atomic64_fetch_sub_acquire arch_atomic64_fetch_sub #define arch_atomic64_fetch_sub_release arch_atomic64_fetch_sub #define arch_atomic64_fetch_sub_relaxed arch_atomic64_fetch_sub #else /* arch_atomic64_fetch_sub_relaxed */ #ifndef arch_atomic64_fetch_sub_acquire static __always_inline s64 arch_atomic64_fetch_sub_acquire(s64 i, atomic64_t *v) { s64 ret = arch_atomic64_fetch_sub_relaxed(i, v); __atomic_acquire_fence(); return ret; } #define arch_atomic64_fetch_sub_acquire arch_atomic64_fetch_sub_acquire #endif #ifndef arch_atomic64_fetch_sub_release static __always_inline s64 arch_atomic64_fetch_sub_release(s64 i, atomic64_t *v) { __atomic_release_fence(); return arch_atomic64_fetch_sub_relaxed(i, v); } #define arch_atomic64_fetch_sub_release arch_atomic64_fetch_sub_release #endif #ifndef arch_atomic64_fetch_sub static __always_inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v) { s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_sub_relaxed(i, v); __atomic_post_full_fence(); return ret; } #define arch_atomic64_fetch_sub arch_atomic64_fetch_sub #endif #endif /* arch_atomic64_fetch_sub_relaxed */ #ifndef arch_atomic64_inc static __always_inline void arch_atomic64_inc(atomic64_t *v) { arch_atomic64_add(1, v); } #define arch_atomic64_inc arch_atomic64_inc #endif #ifndef arch_atomic64_inc_return_relaxed #ifdef arch_atomic64_inc_return #define arch_atomic64_inc_return_acquire arch_atomic64_inc_return #define arch_atomic64_inc_return_release arch_atomic64_inc_return #define arch_atomic64_inc_return_relaxed arch_atomic64_inc_return #endif /* arch_atomic64_inc_return */ #ifndef arch_atomic64_inc_return static __always_inline s64 arch_atomic64_inc_return(atomic64_t *v) { return arch_atomic64_add_return(1, v); } #define arch_atomic64_inc_return arch_atomic64_inc_return #endif #ifndef arch_atomic64_inc_return_acquire static __always_inline s64 arch_atomic64_inc_return_acquire(atomic64_t *v) { return arch_atomic64_add_return_acquire(1, v); } #define arch_atomic64_inc_return_acquire arch_atomic64_inc_return_acquire #endif #ifndef arch_atomic64_inc_return_release static __always_inline s64 arch_atomic64_inc_return_release(atomic64_t *v) { return arch_atomic64_add_return_release(1, v); } #define arch_atomic64_inc_return_release arch_atomic64_inc_return_release #endif #ifndef arch_atomic64_inc_return_relaxed static __always_inline s64 arch_atomic64_inc_return_relaxed(atomic64_t *v) { return arch_atomic64_add_return_relaxed(1, v); } #define arch_atomic64_inc_return_relaxed arch_atomic64_inc_return_relaxed #endif #else /* arch_atomic64_inc_return_relaxed */ #ifndef arch_atomic64_inc_return_acquire static __always_inline s64 arch_atomic64_inc_return_acquire(atomic64_t *v) { s64 ret = arch_atomic64_inc_return_relaxed(v); __atomic_acquire_fence(); return ret; } #define arch_atomic64_inc_return_acquire arch_atomic64_inc_return_acquire #endif #ifndef arch_atomic64_inc_return_release static __always_inline s64 arch_atomic64_inc_return_release(atomic64_t *v) { __atomic_release_fence(); return arch_atomic64_inc_return_relaxed(v); } #define arch_atomic64_inc_return_release arch_atomic64_inc_return_release #endif #ifndef arch_atomic64_inc_return static __always_inline s64 arch_atomic64_inc_return(atomic64_t *v) { s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_inc_return_relaxed(v); __atomic_post_full_fence(); return ret; } #define arch_atomic64_inc_return arch_atomic64_inc_return #endif #endif /* arch_atomic64_inc_return_relaxed */ #ifndef arch_atomic64_fetch_inc_relaxed #ifdef arch_atomic64_fetch_inc #define arch_atomic64_fetch_inc_acquire arch_atomic64_fetch_inc #define arch_atomic64_fetch_inc_release arch_atomic64_fetch_inc #define arch_atomic64_fetch_inc_relaxed arch_atomic64_fetch_inc #endif /* arch_atomic64_fetch_inc */ #ifndef arch_atomic64_fetch_inc static __always_inline s64 arch_atomic64_fetch_inc(atomic64_t *v) { return arch_atomic64_fetch_add(1, v); } #define arch_atomic64_fetch_inc arch_atomic64_fetch_inc #endif #ifndef arch_atomic64_fetch_inc_acquire static __always_inline s64 arch_atomic64_fetch_inc_acquire(atomic64_t *v) { return arch_atomic64_fetch_add_acquire(1, v); } #define arch_atomic64_fetch_inc_acquire arch_atomic64_fetch_inc_acquire #endif #ifndef arch_atomic64_fetch_inc_release static __always_inline s64 arch_atomic64_fetch_inc_release(atomic64_t *v) { return arch_atomic64_fetch_add_release(1, v); } #define arch_atomic64_fetch_inc_release arch_atomic64_fetch_inc_release #endif #ifndef arch_atomic64_fetch_inc_relaxed static __always_inline s64 arch_atomic64_fetch_inc_relaxed(atomic64_t *v) { return arch_atomic64_fetch_add_relaxed(1, v); } #define arch_atomic64_fetch_inc_relaxed arch_atomic64_fetch_inc_relaxed #endif #else /* arch_atomic64_fetch_inc_relaxed */ #ifndef arch_atomic64_fetch_inc_acquire static __always_inline s64 arch_atomic64_fetch_inc_acquire(atomic64_t *v) { s64 ret = arch_atomic64_fetch_inc_relaxed(v); __atomic_acquire_fence(); return ret; } #define arch_atomic64_fetch_inc_acquire arch_atomic64_fetch_inc_acquire #endif #ifndef arch_atomic64_fetch_inc_release static __always_inline s64 arch_atomic64_fetch_inc_release(atomic64_t *v) { __atomic_release_fence(); return arch_atomic64_fetch_inc_relaxed(v); } #define arch_atomic64_fetch_inc_release arch_atomic64_fetch_inc_release #endif #ifndef arch_atomic64_fetch_inc static __always_inline s64 arch_atomic64_fetch_inc(atomic64_t *v) { s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_inc_relaxed(v); __atomic_post_full_fence(); return ret; } #define arch_atomic64_fetch_inc arch_atomic64_fetch_inc #endif #endif /* arch_atomic64_fetch_inc_relaxed */ #ifndef arch_atomic64_dec static __always_inline void arch_atomic64_dec(atomic64_t *v) { arch_atomic64_sub(1, v); } #define arch_atomic64_dec arch_atomic64_dec #endif #ifndef arch_atomic64_dec_return_relaxed #ifdef arch_atomic64_dec_return #define arch_atomic64_dec_return_acquire arch_atomic64_dec_return #define arch_atomic64_dec_return_release arch_atomic64_dec_return #define arch_atomic64_dec_return_relaxed arch_atomic64_dec_return #endif /* arch_atomic64_dec_return */ #ifndef arch_atomic64_dec_return static __always_inline s64 arch_atomic64_dec_return(atomic64_t *v) { return arch_atomic64_sub_return(1, v); } #define arch_atomic64_dec_return arch_atomic64_dec_return #endif #ifndef arch_atomic64_dec_return_acquire static __always_inline s64 arch_atomic64_dec_return_acquire(atomic64_t *v) { return arch_atomic64_sub_return_acquire(1, v); } #define arch_atomic64_dec_return_acquire arch_atomic64_dec_return_acquire #endif #ifndef arch_atomic64_dec_return_release static __always_inline s64 arch_atomic64_dec_return_release(atomic64_t *v) { return arch_atomic64_sub_return_release(1, v); } #define arch_atomic64_dec_return_release arch_atomic64_dec_return_release #endif #ifndef arch_atomic64_dec_return_relaxed static __always_inline s64 arch_atomic64_dec_return_relaxed(atomic64_t *v) { return arch_atomic64_sub_return_relaxed(1, v); } #define arch_atomic64_dec_return_relaxed arch_atomic64_dec_return_relaxed #endif #else /* arch_atomic64_dec_return_relaxed */ #ifndef arch_atomic64_dec_return_acquire static __always_inline s64 arch_atomic64_dec_return_acquire(atomic64_t *v) { s64 ret = arch_atomic64_dec_return_relaxed(v); __atomic_acquire_fence(); return ret; } #define arch_atomic64_dec_return_acquire arch_atomic64_dec_return_acquire #endif #ifndef arch_atomic64_dec_return_release static __always_inline s64 arch_atomic64_dec_return_release(atomic64_t *v) { __atomic_release_fence(); return arch_atomic64_dec_return_relaxed(v); } #define arch_atomic64_dec_return_release arch_atomic64_dec_return_release #endif #ifndef arch_atomic64_dec_return static __always_inline s64 arch_atomic64_dec_return(atomic64_t *v) { s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_dec_return_relaxed(v); __atomic_post_full_fence(); return ret; } #define arch_atomic64_dec_return arch_atomic64_dec_return #endif #endif /* arch_atomic64_dec_return_relaxed */ #ifndef arch_atomic64_fetch_dec_relaxed #ifdef arch_atomic64_fetch_dec #define arch_atomic64_fetch_dec_acquire arch_atomic64_fetch_dec #define arch_atomic64_fetch_dec_release arch_atomic64_fetch_dec #define arch_atomic64_fetch_dec_relaxed arch_atomic64_fetch_dec #endif /* arch_atomic64_fetch_dec */ #ifndef arch_atomic64_fetch_dec static __always_inline s64 arch_atomic64_fetch_dec(atomic64_t *v) { return arch_atomic64_fetch_sub(1, v); } #define arch_atomic64_fetch_dec arch_atomic64_fetch_dec #endif #ifndef arch_atomic64_fetch_dec_acquire static __always_inline s64 arch_atomic64_fetch_dec_acquire(atomic64_t *v) { return arch_atomic64_fetch_sub_acquire(1, v); } #define arch_atomic64_fetch_dec_acquire arch_atomic64_fetch_dec_acquire #endif #ifndef arch_atomic64_fetch_dec_release static __always_inline s64 arch_atomic64_fetch_dec_release(atomic64_t *v) { return arch_atomic64_fetch_sub_release(1, v); } #define arch_atomic64_fetch_dec_release arch_atomic64_fetch_dec_release #endif #ifndef arch_atomic64_fetch_dec_relaxed static __always_inline s64 arch_atomic64_fetch_dec_relaxed(atomic64_t *v) { return arch_atomic64_fetch_sub_relaxed(1, v); } #define arch_atomic64_fetch_dec_relaxed arch_atomic64_fetch_dec_relaxed #endif #else /* arch_atomic64_fetch_dec_relaxed */ #ifndef arch_atomic64_fetch_dec_acquire static __always_inline s64 arch_atomic64_fetch_dec_acquire(atomic64_t *v) { s64 ret = arch_atomic64_fetch_dec_relaxed(v); __atomic_acquire_fence(); return ret; } #define arch_atomic64_fetch_dec_acquire arch_atomic64_fetch_dec_acquire #endif #ifndef arch_atomic64_fetch_dec_release static __always_inline s64 arch_atomic64_fetch_dec_release(atomic64_t *v) { __atomic_release_fence(); return arch_atomic64_fetch_dec_relaxed(v); } #define arch_atomic64_fetch_dec_release arch_atomic64_fetch_dec_release #endif #ifndef arch_atomic64_fetch_dec static __always_inline s64 arch_atomic64_fetch_dec(atomic64_t *v) { s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_dec_relaxed(v); __atomic_post_full_fence(); return ret; } #define arch_atomic64_fetch_dec arch_atomic64_fetch_dec #endif #endif /* arch_atomic64_fetch_dec_relaxed */ #ifndef arch_atomic64_fetch_and_relaxed #define arch_atomic64_fetch_and_acquire arch_atomic64_fetch_and #define arch_atomic64_fetch_and_release arch_atomic64_fetch_and #define arch_atomic64_fetch_and_relaxed arch_atomic64_fetch_and #else /* arch_atomic64_fetch_and_relaxed */ #ifndef arch_atomic64_fetch_and_acquire static __always_inline s64 arch_atomic64_fetch_and_acquire(s64 i, atomic64_t *v) { s64 ret = arch_atomic64_fetch_and_relaxed(i, v); __atomic_acquire_fence(); return ret; } #define arch_atomic64_fetch_and_acquire arch_atomic64_fetch_and_acquire #endif #ifndef arch_atomic64_fetch_and_release static __always_inline s64 arch_atomic64_fetch_and_release(s64 i, atomic64_t *v) { __atomic_release_fence(); return arch_atomic64_fetch_and_relaxed(i, v); } #define arch_atomic64_fetch_and_release arch_atomic64_fetch_and_release #endif #ifndef arch_atomic64_fetch_and static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) { s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_and_relaxed(i, v); __atomic_post_full_fence(); return ret; } #define arch_atomic64_fetch_and arch_atomic64_fetch_and #endif #endif /* arch_atomic64_fetch_and_relaxed */ #ifndef arch_atomic64_andnot static __always_inline void arch_atomic64_andnot(s64 i, atomic64_t *v) { arch_atomic64_and(~i, v); } #define arch_atomic64_andnot arch_atomic64_andnot #endif #ifndef arch_atomic64_fetch_andnot_relaxed #ifdef arch_atomic64_fetch_andnot #define arch_atomic64_fetch_andnot_acquire arch_atomic64_fetch_andnot #define arch_atomic64_fetch_andnot_release arch_atomic64_fetch_andnot #define arch_atomic64_fetch_andnot_relaxed arch_atomic64_fetch_andnot #endif /* arch_atomic64_fetch_andnot */ #ifndef arch_atomic64_fetch_andnot static __always_inline s64 arch_atomic64_fetch_andnot(s64 i, atomic64_t *v) { return arch_atomic64_fetch_and(~i, v); } #define arch_atomic64_fetch_andnot arch_atomic64_fetch_andnot #endif #ifndef arch_atomic64_fetch_andnot_acquire static __always_inline s64 arch_atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) { return arch_atomic64_fetch_and_acquire(~i, v); } #define arch_atomic64_fetch_andnot_acquire arch_atomic64_fetch_andnot_acquire #endif #ifndef arch_atomic64_fetch_andnot_release static __always_inline s64 arch_atomic64_fetch_andnot_release(s64 i, atomic64_t *v) { return arch_atomic64_fetch_and_release(~i, v); } #define arch_atomic64_fetch_andnot_release arch_atomic64_fetch_andnot_release #endif #ifndef arch_atomic64_fetch_andnot_relaxed static __always_inline s64 arch_atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v) { return arch_atomic64_fetch_and_relaxed(~i, v); } #define arch_atomic64_fetch_andnot_relaxed arch_atomic64_fetch_andnot_relaxed #endif #else /* arch_atomic64_fetch_andnot_relaxed */ #ifndef arch_atomic64_fetch_andnot_acquire static __always_inline s64 arch_atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) { s64 ret = arch_atomic64_fetch_andnot_relaxed(i, v); __atomic_acquire_fence(); return ret; } #define arch_atomic64_fetch_andnot_acquire arch_atomic64_fetch_andnot_acquire #endif #ifndef arch_atomic64_fetch_andnot_release static __always_inline s64 arch_atomic64_fetch_andnot_release(s64 i, atomic64_t *v) { __atomic_release_fence(); return arch_atomic64_fetch_andnot_relaxed(i, v); } #define arch_atomic64_fetch_andnot_release arch_atomic64_fetch_andnot_release #endif #ifndef arch_atomic64_fetch_andnot static __always_inline s64 arch_atomic64_fetch_andnot(s64 i, atomic64_t *v) { s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_andnot_relaxed(i, v); __atomic_post_full_fence(); return ret; } #define arch_atomic64_fetch_andnot arch_atomic64_fetch_andnot #endif #endif /* arch_atomic64_fetch_andnot_relaxed */ #ifndef arch_atomic64_fetch_or_relaxed #define arch_atomic64_fetch_or_acquire arch_atomic64_fetch_or #define arch_atomic64_fetch_or_release arch_atomic64_fetch_or #define arch_atomic64_fetch_or_relaxed arch_atomic64_fetch_or #else /* arch_atomic64_fetch_or_relaxed */ #ifndef arch_atomic64_fetch_or_acquire static __always_inline s64 arch_atomic64_fetch_or_acquire(s64 i, atomic64_t *v) { s64 ret = arch_atomic64_fetch_or_relaxed(i, v); __atomic_acquire_fence(); return ret; } #define arch_atomic64_fetch_or_acquire arch_atomic64_fetch_or_acquire #endif #ifndef arch_atomic64_fetch_or_release static __always_inline s64 arch_atomic64_fetch_or_release(s64 i, atomic64_t *v) { __atomic_release_fence(); return arch_atomic64_fetch_or_relaxed(i, v); } #define arch_atomic64_fetch_or_release arch_atomic64_fetch_or_release #endif #ifndef arch_atomic64_fetch_or static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) { s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_or_relaxed(i, v); __atomic_post_full_fence(); return ret; } #define arch_atomic64_fetch_or arch_atomic64_fetch_or #endif #endif /* arch_atomic64_fetch_or_relaxed */ #ifndef arch_atomic64_fetch_xor_relaxed #define arch_atomic64_fetch_xor_acquire arch_atomic64_fetch_xor #define arch_atomic64_fetch_xor_release arch_atomic64_fetch_xor #define arch_atomic64_fetch_xor_relaxed arch_atomic64_fetch_xor #else /* arch_atomic64_fetch_xor_relaxed */ #ifndef arch_atomic64_fetch_xor_acquire static __always_inline s64 arch_atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) { s64 ret = arch_atomic64_fetch_xor_relaxed(i, v); __atomic_acquire_fence(); return ret; } #define arch_atomic64_fetch_xor_acquire arch_atomic64_fetch_xor_acquire #endif #ifndef arch_atomic64_fetch_xor_release static __always_inline s64 arch_atomic64_fetch_xor_release(s64 i, atomic64_t *v) { __atomic_release_fence(); return arch_atomic64_fetch_xor_relaxed(i, v); } #define arch_atomic64_fetch_xor_release arch_atomic64_fetch_xor_release #endif #ifndef arch_atomic64_fetch_xor static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) { s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_fetch_xor_relaxed(i, v); __atomic_post_full_fence(); return ret; } #define arch_atomic64_fetch_xor arch_atomic64_fetch_xor #endif #endif /* arch_atomic64_fetch_xor_relaxed */ #ifndef arch_atomic64_xchg_relaxed #define arch_atomic64_xchg_acquire arch_atomic64_xchg #define arch_atomic64_xchg_release arch_atomic64_xchg #define arch_atomic64_xchg_relaxed arch_atomic64_xchg #else /* arch_atomic64_xchg_relaxed */ #ifndef arch_atomic64_xchg_acquire static __always_inline s64 arch_atomic64_xchg_acquire(atomic64_t *v, s64 i) { s64 ret = arch_atomic64_xchg_relaxed(v, i); __atomic_acquire_fence(); return ret; } #define arch_atomic64_xchg_acquire arch_atomic64_xchg_acquire #endif #ifndef arch_atomic64_xchg_release static __always_inline s64 arch_atomic64_xchg_release(atomic64_t *v, s64 i) { __atomic_release_fence(); return arch_atomic64_xchg_relaxed(v, i); } #define arch_atomic64_xchg_release arch_atomic64_xchg_release #endif #ifndef arch_atomic64_xchg static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 i) { s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_xchg_relaxed(v, i); __atomic_post_full_fence(); return ret; } #define arch_atomic64_xchg arch_atomic64_xchg #endif #endif /* arch_atomic64_xchg_relaxed */ #ifndef arch_atomic64_cmpxchg_relaxed #define arch_atomic64_cmpxchg_acquire arch_atomic64_cmpxchg #define arch_atomic64_cmpxchg_release arch_atomic64_cmpxchg #define arch_atomic64_cmpxchg_relaxed arch_atomic64_cmpxchg #else /* arch_atomic64_cmpxchg_relaxed */ #ifndef arch_atomic64_cmpxchg_acquire static __always_inline s64 arch_atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) { s64 ret = arch_atomic64_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; } #define arch_atomic64_cmpxchg_acquire arch_atomic64_cmpxchg_acquire #endif #ifndef arch_atomic64_cmpxchg_release static __always_inline s64 arch_atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) { __atomic_release_fence(); return arch_atomic64_cmpxchg_relaxed(v, old, new); } #define arch_atomic64_cmpxchg_release arch_atomic64_cmpxchg_release #endif #ifndef arch_atomic64_cmpxchg static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { s64 ret; __atomic_pre_full_fence(); ret = arch_atomic64_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; } #define arch_atomic64_cmpxchg arch_atomic64_cmpxchg #endif #endif /* arch_atomic64_cmpxchg_relaxed */ #ifndef arch_atomic64_try_cmpxchg_relaxed #ifdef arch_atomic64_try_cmpxchg #define arch_atomic64_try_cmpxchg_acquire arch_atomic64_try_cmpxchg #define arch_atomic64_try_cmpxchg_release arch_atomic64_try_cmpxchg #define arch_atomic64_try_cmpxchg_relaxed arch_atomic64_try_cmpxchg #endif /* arch_atomic64_try_cmpxchg */ #ifndef arch_atomic64_try_cmpxchg static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { s64 r, o = *old; r = arch_atomic64_cmpxchg(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); } #define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg #endif #ifndef arch_atomic64_try_cmpxchg_acquire static __always_inline bool arch_atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) { s64 r, o = *old; r = arch_atomic64_cmpxchg_acquire(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); } #define arch_atomic64_try_cmpxchg_acquire arch_atomic64_try_cmpxchg_acquire #endif #ifndef arch_atomic64_try_cmpxchg_release static __always_inline bool arch_atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) { s64 r, o = *old; r = arch_atomic64_cmpxchg_release(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); } #define arch_atomic64_try_cmpxchg_release arch_atomic64_try_cmpxchg_release #endif #ifndef arch_atomic64_try_cmpxchg_relaxed static __always_inline bool arch_atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) { s64 r, o = *old; r = arch_atomic64_cmpxchg_relaxed(v, o, new); if (unlikely(r != o)) *old = r; return likely(r == o); } #define arch_atomic64_try_cmpxchg_relaxed arch_atomic64_try_cmpxchg_relaxed #endif #else /* arch_atomic64_try_cmpxchg_relaxed */ #ifndef arch_atomic64_try_cmpxchg_acquire static __always_inline bool arch_atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) { bool ret = arch_atomic64_try_cmpxchg_relaxed(v, old, new); __atomic_acquire_fence(); return ret; } #define arch_atomic64_try_cmpxchg_acquire arch_atomic64_try_cmpxchg_acquire #endif #ifndef arch_atomic64_try_cmpxchg_release static __always_inline bool arch_atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) { __atomic_release_fence(); return arch_atomic64_try_cmpxchg_relaxed(v, old, new); } #define arch_atomic64_try_cmpxchg_release arch_atomic64_try_cmpxchg_release #endif #ifndef arch_atomic64_try_cmpxchg static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { bool ret; __atomic_pre_full_fence(); ret = arch_atomic64_try_cmpxchg_relaxed(v, old, new); __atomic_post_full_fence(); return ret; } #define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg #endif #endif /* arch_atomic64_try_cmpxchg_relaxed */ #ifndef arch_atomic64_sub_and_test /** * arch_atomic64_sub_and_test - subtract value from variable and test result * @i: integer value to subtract * @v: pointer of type atomic64_t * * Atomically subtracts @i from @v and returns * true if the result is zero, or false for all * other cases. */ static __always_inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v) { return arch_atomic64_sub_return(i, v) == 0; } #define arch_atomic64_sub_and_test arch_atomic64_sub_and_test #endif #ifndef arch_atomic64_dec_and_test /** * arch_atomic64_dec_and_test - decrement and test * @v: pointer of type atomic64_t * * Atomically decrements @v by 1 and * returns true if the result is 0, or false for all other * cases. */ static __always_inline bool arch_atomic64_dec_and_test(atomic64_t *v) { return arch_atomic64_dec_return(v) == 0; } #define arch_atomic64_dec_and_test arch_atomic64_dec_and_test #endif #ifndef arch_atomic64_inc_and_test /** * arch_atomic64_inc_and_test - increment and test * @v: pointer of type atomic64_t * * Atomically increments @v by 1 * and returns true if the result is zero, or false for all * other cases. */ static __always_inline bool arch_atomic64_inc_and_test(atomic64_t *v) { return arch_atomic64_inc_return(v) == 0; } #define arch_atomic64_inc_and_test arch_atomic64_inc_and_test #endif #ifndef arch_atomic64_add_negative /** * arch_atomic64_add_negative - add and test if negative * @i: integer value to add * @v: pointer of type atomic64_t * * Atomically adds @i to @v and returns true * if the result is negative, or false when * result is greater than or equal to zero. */ static __always_inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v) { return arch_atomic64_add_return(i, v) < 0; } #define arch_atomic64_add_negative arch_atomic64_add_negative #endif #ifndef arch_atomic64_fetch_add_unless /** * arch_atomic64_fetch_add_unless - add unless the number is already a given value * @v: pointer of type atomic64_t * @a: the amount to add to v... * @u: ...unless v is equal to u. * * Atomically adds @a to @v, so long as @v was not already @u. * Returns original value of @v */ static __always_inline s64 arch_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { s64 c = arch_atomic64_read(v); do { if (unlikely(c == u)) break; } while (!arch_atomic64_try_cmpxchg(v, &c, c + a)); return c; } #define arch_atomic64_fetch_add_unless arch_atomic64_fetch_add_unless #endif #ifndef arch_atomic64_add_unless /** * arch_atomic64_add_unless - add unless the number is already a given value * @v: pointer of type atomic64_t * @a: the amount to add to v... * @u: ...unless v is equal to u. * * Atomically adds @a to @v, if @v was not already @u. * Returns true if the addition was done. */ static __always_inline bool arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) { return arch_atomic64_fetch_add_unless(v, a, u) != u; } #define arch_atomic64_add_unless arch_atomic64_add_unless #endif #ifndef arch_atomic64_inc_not_zero /** * arch_atomic64_inc_not_zero - increment unless the number is zero * @v: pointer of type atomic64_t * * Atomically increments @v by 1, if @v is non-zero. * Returns true if the increment was done. */ static __always_inline bool arch_atomic64_inc_not_zero(atomic64_t *v) { return arch_atomic64_add_unless(v, 1, 0); } #define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero #endif #ifndef arch_atomic64_inc_unless_negative static __always_inline bool arch_atomic64_inc_unless_negative(atomic64_t *v) { s64 c = arch_atomic64_read(v); do { if (unlikely(c < 0)) return false; } while (!arch_atomic64_try_cmpxchg(v, &c, c + 1)); return true; } #define arch_atomic64_inc_unless_negative arch_atomic64_inc_unless_negative #endif #ifndef arch_atomic64_dec_unless_positive static __always_inline bool arch_atomic64_dec_unless_positive(atomic64_t *v) { s64 c = arch_atomic64_read(v); do { if (unlikely(c > 0)) return false; } while (!arch_atomic64_try_cmpxchg(v, &c, c - 1)); return true; } #define arch_atomic64_dec_unless_positive arch_atomic64_dec_unless_positive #endif #ifndef arch_atomic64_dec_if_positive static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) { s64 dec, c = arch_atomic64_read(v); do { dec = c - 1; if (unlikely(dec < 0)) break; } while (!arch_atomic64_try_cmpxchg(v, &c, dec)); return dec; } #define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive #endif #endif /* _LINUX_ATOMIC_FALLBACK_H */ // 90cd26cfd69d2250303d654955a0cc12620fb91b
9 3 9 9 8 8 8 9 8 8 9 9 9 9 9 9 9 9 9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 // SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> #include <linux/highuid.h> #include <linux/cred.h> #include <linux/securebits.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <keys/user-type.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/projid.h> #include <linux/fs_struct.h> #include <linux/bsearch.h> #include <linux/sort.h> static struct kmem_cache *user_ns_cachep __read_mostly; static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); static void free_user_ns(struct work_struct *work); static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) { return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES); } static void dec_user_namespaces(struct ucounts *ucounts) { return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); } static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) { /* Start with the same capabilities as init but useless for doing * anything as the capabilities are bound to the new user namespace. */ cred->securebits = SECUREBITS_DEFAULT; cred->cap_inheritable = CAP_EMPTY_SET; cred->cap_permitted = CAP_FULL_SET; cred->cap_effective = CAP_FULL_SET; cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(cred->request_key_auth); cred->request_key_auth = NULL; #endif /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ cred->user_ns = user_ns; } /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the * new namespace. * * This is called by copy_creds(), which will finish setting the target task's * credentials. */ int create_user_ns(struct cred *new) { struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; struct ucounts *ucounts; int ret, i; ret = -ENOSPC; if (parent_ns->level > 32) goto fail; ucounts = inc_user_namespaces(parent_ns, owner); if (!ucounts) goto fail; /* * Verify that we can not violate the policy of which files * may be accessed that is specified by the root directory, * by verifing that the root directory is at the root of the * mount namespace which allows all files to be accessed. */ ret = -EPERM; if (current_chrooted()) goto fail_dec; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. */ ret = -EPERM; if (!kuid_has_mapping(parent_ns, owner) || !kgid_has_mapping(parent_ns, group)) goto fail_dec; ret = -ENOMEM; ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) goto fail_dec; ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); ret = ns_alloc_inum(&ns->ns); if (ret) goto fail_free; ns->ns.ops = &userns_operations; atomic_set(&ns->count, 1); /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->level = parent_ns->level + 1; ns->owner = owner; ns->group = group; INIT_WORK(&ns->work, free_user_ns); for (i = 0; i < UCOUNT_COUNTS; i++) { ns->ucount_max[i] = INT_MAX; } ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ mutex_lock(&userns_state_mutex); ns->flags = parent_ns->flags; mutex_unlock(&userns_state_mutex); #ifdef CONFIG_KEYS INIT_LIST_HEAD(&ns->keyring_name_list); init_rwsem(&ns->keyring_sem); #endif ret = -ENOMEM; if (!setup_userns_sysctls(ns)) goto fail_keyring; set_cred_user_ns(new, ns); return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif ns_free_inum(&ns->ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: dec_user_namespaces(ucounts); fail: return ret; } int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { struct cred *cred; int err = -ENOMEM; if (!(unshare_flags & CLONE_NEWUSER)) return 0; cred = prepare_creds(); if (cred) { err = create_user_ns(cred); if (err) put_cred(cred); else *new_cred = cred; } return err; } static void free_user_ns(struct work_struct *work) { struct user_namespace *parent, *ns = container_of(work, struct user_namespace, work); do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->gid_map.forward); kfree(ns->gid_map.reverse); } if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->uid_map.forward); kfree(ns->uid_map.reverse); } if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->projid_map.forward); kfree(ns->projid_map.reverse); } retire_userns_sysctls(ns); key_free_user_ns(ns); ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); dec_user_namespaces(ucounts); ns = parent; } while (atomic_dec_and_test(&parent->count)); } void __put_user_ns(struct user_namespace *ns) { schedule_work(&ns->work); } EXPORT_SYMBOL(__put_user_ns); /** * idmap_key struct holds the information necessary to find an idmapping in a * sorted idmap array. It is passed to cmp_map_id() as first argument. */ struct idmap_key { bool map_up; /* true -> id from kid; false -> kid from id */ u32 id; /* id to find */ u32 count; /* == 0 unless used with map_id_range_down() */ }; /** * cmp_map_id - Function to be passed to bsearch() to find the requested * idmapping. Expects struct idmap_key to be passed via @k. */ static int cmp_map_id(const void *k, const void *e) { u32 first, last, id2; const struct idmap_key *key = k; const struct uid_gid_extent *el = e; id2 = key->id + key->count - 1; /* handle map_id_{down,up}() */ if (key->map_up) first = el->lower_first; else first = el->first; last = first + el->count - 1; if (key->id >= first && key->id <= last && (id2 >= first && id2 <= last)) return 0; if (key->id < first || id2 < first) return -1; return 1; } /** * map_id_range_down_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = false; key.count = count; key.id = id; return bsearch(&key, map->forward, extents, sizeof(struct uid_gid_extent), cmp_map_id); } /** * map_id_range_down_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; } static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_range_down_base(extents, map, id, count); else extent = map_id_range_down_max(extents, map, id, count); /* Map the id or note failure */ if (extent) id = (id - extent->first) + extent->lower_first; else id = (u32) -1; return id; } static u32 map_id_down(struct uid_gid_map *map, u32 id) { return map_id_range_down(map, id, 1); } /** * map_id_up_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) { unsigned idx; u32 first, last; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last) return &map->extent[idx]; } return NULL; } /** * map_id_up_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) { struct idmap_key key; key.map_up = true; key.count = 1; key.id = id; return bsearch(&key, map->reverse, extents, sizeof(struct uid_gid_extent), cmp_map_id); } static u32 map_id_up(struct uid_gid_map *map, u32 id) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_up_base(extents, map, id); else extent = map_id_up_max(extents, map, id); /* Map the id or note failure */ if (extent) id = (id - extent->lower_first) + extent->first; else id = (u32) -1; return id; } /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in * @uid: User identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace uid * pair INVALID_UID is returned. Callers are expected to test * for and handle INVALID_UID being returned. INVALID_UID * may be tested for using uid_valid(). */ kuid_t make_kuid(struct user_namespace *ns, uid_t uid) { /* Map the uid to a global kernel uid */ return KUIDT_INIT(map_id_down(&ns->uid_map, uid)); } EXPORT_SYMBOL(make_kuid); /** * from_kuid - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * If @kuid has no mapping in @targ (uid_t)-1 is returned. */ uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->uid_map, __kuid_val(kuid)); } EXPORT_SYMBOL(from_kuid); /** * from_kuid_munged - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kuid from_kuid_munged never fails and always * returns a valid uid. This makes from_kuid_munged appropriate * for use in syscalls like stat and getuid where failing the * system call and failing to provide a valid uid are not an * options. * * If @kuid has no mapping in @targ overflowuid is returned. */ uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) { uid_t uid; uid = from_kuid(targ, kuid); if (uid == (uid_t) -1) uid = overflowuid; return uid; } EXPORT_SYMBOL(from_kuid_munged); /** * make_kgid - Map a user-namespace gid pair into a kgid. * @ns: User namespace that the gid is in * @gid: group identifier * * Maps a user-namespace gid pair into a kernel internal kgid, * and returns that kgid. * * When there is no mapping defined for the user-namespace gid * pair INVALID_GID is returned. Callers are expected to test * for and handle INVALID_GID being returned. INVALID_GID may be * tested for using gid_valid(). */ kgid_t make_kgid(struct user_namespace *ns, gid_t gid) { /* Map the gid to a global kernel gid */ return KGIDT_INIT(map_id_down(&ns->gid_map, gid)); } EXPORT_SYMBOL(make_kgid); /** * from_kgid - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * If @kgid has no mapping in @targ (gid_t)-1 is returned. */ gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) { /* Map the gid from a global kernel gid */ return map_id_up(&targ->gid_map, __kgid_val(kgid)); } EXPORT_SYMBOL(from_kgid); /** * from_kgid_munged - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kgid from_kgid_munged never fails and always * returns a valid gid. This makes from_kgid_munged appropriate * for use in syscalls like stat and getgid where failing the * system call and failing to provide a valid gid are not options. * * If @kgid has no mapping in @targ overflowgid is returned. */ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) { gid_t gid; gid = from_kgid(targ, kgid); if (gid == (gid_t) -1) gid = overflowgid; return gid; } EXPORT_SYMBOL(from_kgid_munged); /** * make_kprojid - Map a user-namespace projid pair into a kprojid. * @ns: User namespace that the projid is in * @projid: Project identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace projid * pair INVALID_PROJID is returned. Callers are expected to test * for and handle INVALID_PROJID being returned. INVALID_PROJID * may be tested for using projid_valid(). */ kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) { /* Map the uid to a global kernel uid */ return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid)); } EXPORT_SYMBOL(make_kprojid); /** * from_kprojid - Create a projid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal project identifier to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * If @kprojid has no mapping in @targ (projid_t)-1 is returned. */ projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->projid_map, __kprojid_val(kprojid)); } EXPORT_SYMBOL(from_kprojid); /** * from_kprojid_munged - Create a projiid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal projid to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kprojid from_kprojid_munged never fails and always * returns a valid projid. This makes from_kprojid_munged * appropriate for use in syscalls like stat and where * failing the system call and failing to provide a valid projid are * not an options. * * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned. */ projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid) { projid_t projid; projid = from_kprojid(targ, kprojid); if (projid == (projid_t) -1) projid = OVERFLOW_PROJID; return projid; } EXPORT_SYMBOL(from_kprojid_munged); static int uid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; uid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int gid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; gid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int projid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; projid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { loff_t pos = *ppos; unsigned extents = map->nr_extents; smp_rmb(); if (pos >= extents) return NULL; if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return &map->extent[pos]; return &map->forward[pos]; } static void *uid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->uid_map); } static void *gid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->gid_map); } static void *projid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->projid_map); } static void *m_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; return seq->op->start(seq, pos); } static void m_stop(struct seq_file *seq, void *v) { return; } const struct seq_operations proc_uid_seq_operations = { .start = uid_m_start, .stop = m_stop, .next = m_next, .show = uid_m_show, }; const struct seq_operations proc_gid_seq_operations = { .start = gid_m_start, .stop = m_stop, .next = m_next, .show = gid_m_show, }; const struct seq_operations proc_projid_seq_operations = { .start = projid_m_start, .stop = m_stop, .next = m_next, .show = projid_m_show, }; static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) { u32 upper_first, lower_first, upper_last, lower_last; unsigned idx; upper_first = extent->first; lower_first = extent->lower_first; upper_last = upper_first + extent->count - 1; lower_last = lower_first + extent->count - 1; for (idx = 0; idx < new_map->nr_extents; idx++) { u32 prev_upper_first, prev_lower_first; u32 prev_upper_last, prev_lower_last; struct uid_gid_extent *prev; if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) prev = &new_map->extent[idx]; else prev = &new_map->forward[idx]; prev_upper_first = prev->first; prev_lower_first = prev->lower_first; prev_upper_last = prev_upper_first + prev->count - 1; prev_lower_last = prev_lower_first + prev->count - 1; /* Does the upper range intersect a previous extent? */ if ((prev_upper_first <= upper_last) && (prev_upper_last >= upper_first)) return true; /* Does the lower range intersect a previous extent? */ if ((prev_lower_first <= lower_last) && (prev_lower_last >= lower_first)) return true; } return false; } /** * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. * Takes care to allocate a 4K block of memory if the number of mappings exceeds * UID_GID_MAP_MAX_BASE_EXTENTS. */ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) { struct uid_gid_extent *dest; if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { struct uid_gid_extent *forward; /* Allocate memory for 340 mappings. */ forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS, sizeof(struct uid_gid_extent), GFP_KERNEL); if (!forward) return -ENOMEM; /* Copy over memory. Only set up memory for the forward pointer. * Defer the memory setup for the reverse pointer. */ memcpy(forward, map->extent, map->nr_extents * sizeof(map->extent[0])); map->forward = forward; map->reverse = NULL; } if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) dest = &map->extent[map->nr_extents]; else dest = &map->forward[map->nr_extents]; *dest = *extent; map->nr_extents++; return 0; } /* cmp function to sort() forward mappings */ static int cmp_extents_forward(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->first < e2->first) return -1; if (e1->first > e2->first) return 1; return 0; } /* cmp function to sort() reverse mappings */ static int cmp_extents_reverse(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->lower_first < e2->lower_first) return -1; if (e1->lower_first > e2->lower_first) return 1; return 0; } /** * sort_idmaps - Sorts an array of idmap entries. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static int sort_idmaps(struct uid_gid_map *map) { if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return 0; /* Sort forward array. */ sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_forward, NULL); /* Only copy the memory from forward we actually need. */ map->reverse = kmemdup(map->forward, map->nr_extents * sizeof(struct uid_gid_extent), GFP_KERNEL); if (!map->reverse) return -ENOMEM; /* Sort reverse array. */ sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_reverse, NULL); return 0; } /** * verify_root_map() - check the uid 0 mapping * @file: idmapping file * @map_ns: user namespace of the target process * @new_map: requested idmap * * If a process requests mapping parent uid 0 into the new ns, verify that the * process writing the map had the CAP_SETFCAP capability as the target process * will be able to write fscaps that are valid in ancestor user namespaces. * * Return: true if the mapping is allowed, false if not. */ static bool verify_root_map(const struct file *file, struct user_namespace *map_ns, struct uid_gid_map *new_map) { int idx; const struct user_namespace *file_ns = file->f_cred->user_ns; struct uid_gid_extent *extent0 = NULL; for (idx = 0; idx < new_map->nr_extents; idx++) { if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent0 = &new_map->extent[idx]; else extent0 = &new_map->forward[idx]; if (extent0->lower_first == 0) break; extent0 = NULL; } if (!extent0) return true; if (map_ns == file_ns) { /* The process unshared its ns and is writing to its own * /proc/self/uid_map. User already has full capabilites in * the new namespace. Verify that the parent had CAP_SETFCAP * when it unshared. * */ if (!file_ns->parent_could_setfcap) return false; } else { /* Process p1 is writing to uid_map of p2, who is in a child * user namespace to p1's. Verify that the opener of the map * file has CAP_SETFCAP against the parent of the new map * namespace */ if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP)) return false; } return true; } static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, struct uid_gid_map *map, struct uid_gid_map *parent_map) { struct seq_file *seq = file->private_data; struct user_namespace *map_ns = seq->private; struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent extent; char *kbuf = NULL, *pos, *next_line; ssize_t ret; /* Only allow < page size writes at the beginning of the file */ if ((*ppos != 0) || (count >= PAGE_SIZE)) return -EINVAL; /* Slurp in the user data */ kbuf = memdup_user_nul(buf, count); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); /* * The userns_state_mutex serializes all writes to any given map. * * Any map is only ever written once. * * An id map fits within 1 cache line on most architectures. * * On read nothing needs to be done unless you are on an * architecture with a crazy cache coherency model like alpha. * * There is a one time data dependency between reading the * count of the extents and the values of the extents. The * desired behavior is to see the values of the extents that * were written before the count of the extents. * * To achieve this smp_wmb() is used on guarantee the write * order and smp_rmb() is guaranteed that we don't have crazy * architectures returning stale data. */ mutex_lock(&userns_state_mutex); memset(&new_map, 0, sizeof(struct uid_gid_map)); ret = -EPERM; /* Only allow one successful write to the map */ if (map->nr_extents != 0) goto out; /* * Adjusting namespace settings requires capabilities on the target. */ if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN)) goto out; /* Parse the user data */ ret = -EINVAL; pos = kbuf; for (; pos; pos = next_line) { /* Find the end of line and ensure I don't look past it */ next_line = strchr(pos, '\n'); if (next_line) { *next_line = '\0'; next_line++; if (*next_line == '\0') next_line = NULL; } pos = skip_spaces(pos); extent.first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.lower_first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.count = simple_strtoul(pos, &pos, 10); if (*pos && !isspace(*pos)) goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; /* Verify we have been given valid starting values */ if ((extent.first == (u32) -1) || (extent.lower_first == (u32) -1)) goto out; /* Verify count is not zero and does not cause the * extent to wrap */ if ((extent.first + extent.count) <= extent.first) goto out; if ((extent.lower_first + extent.count) <= extent.lower_first) goto out; /* Do the ranges in extent overlap any previous extents? */ if (mappings_overlap(&new_map, &extent)) goto out; if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS && (next_line != NULL)) goto out; ret = insert_extent(&new_map, &extent); if (ret < 0) goto out; ret = -EINVAL; } /* Be very certaint the new map actually exists */ if (new_map.nr_extents == 0) goto out; ret = -EPERM; /* Validate the user is allowed to use user id's mapped to. */ if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map)) goto out; ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. */ for (idx = 0; idx < new_map.nr_extents; idx++) { struct uid_gid_extent *e; u32 lower_first; if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) e = &new_map.extent[idx]; else e = &new_map.forward[idx]; lower_first = map_id_range_down(parent_map, e->lower_first, e->count); /* Fail if we can not map the specified extent to * the kernel global id space. */ if (lower_first == (u32) -1) goto out; e->lower_first = lower_first; } /* * If we want to use binary search for lookup, this clones the extent * array and sorts both copies. */ ret = sort_idmaps(&new_map); if (ret < 0) goto out; /* Install the map */ if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { memcpy(map->extent, new_map.extent, new_map.nr_extents * sizeof(new_map.extent[0])); } else { map->forward = new_map.forward; map->reverse = new_map.reverse; } smp_wmb(); map->nr_extents = new_map.nr_extents; *ppos = count; ret = count; out: if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(new_map.forward); kfree(new_map.reverse); map->forward = NULL; map->reverse = NULL; map->nr_extents = 0; } mutex_unlock(&userns_state_mutex); kfree(kbuf); return ret; } ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETUID, &ns->uid_map, &ns->parent->uid_map); } ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETGID, &ns->gid_map, &ns->parent->gid_map); } ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; /* Anyone can set any valid project id no capability needed */ return map_write(file, buf, size, ppos, -1, &ns->projid_map, &ns->parent->projid_map); } static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { const struct cred *cred = file->f_cred; if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map)) return false; /* Don't allow mappings that would allow anything that wouldn't * be allowed without the establishment of unprivileged mappings. */ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && uid_eq(ns->owner, cred->euid)) { u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); if (uid_eq(uid, cred->euid)) return true; } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && gid_eq(gid, cred->egid)) return true; } } /* Allow anyone to set a mapping that doesn't require privilege */ if (!cap_valid(cap_setid)) return true; /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. * And the opener of the id file also had the approprpiate capability. */ if (ns_capable(ns->parent, cap_setid) && file_ns_capable(file, ns->parent, cap_setid)) return true; return false; } int proc_setgroups_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; unsigned long userns_flags = READ_ONCE(ns->flags); seq_printf(seq, "%s\n", (userns_flags & USERNS_SETGROUPS_ALLOWED) ? "allow" : "deny"); return 0; } ssize_t proc_setgroups_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; char kbuf[8], *pos; bool setgroups_allowed; ssize_t ret; /* Only allow a very narrow range of strings to be written */ ret = -EINVAL; if ((*ppos != 0) || (count >= sizeof(kbuf))) goto out; /* What was written? */ ret = -EFAULT; if (copy_from_user(kbuf, buf, count)) goto out; kbuf[count] = '\0'; pos = kbuf; /* What is being requested? */ ret = -EINVAL; if (strncmp(pos, "allow", 5) == 0) { pos += 5; setgroups_allowed = true; } else if (strncmp(pos, "deny", 4) == 0) { pos += 4; setgroups_allowed = false; } else goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; ret = -EPERM; mutex_lock(&userns_state_mutex); if (setgroups_allowed) { /* Enabling setgroups after setgroups has been disabled * is not allowed. */ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) goto out_unlock; } else { /* Permanently disabling setgroups after setgroups has * been enabled by writing the gid_map is not allowed. */ if (ns->gid_map.nr_extents != 0) goto out_unlock; ns->flags &= ~USERNS_SETGROUPS_ALLOWED; } mutex_unlock(&userns_state_mutex); /* Report a successful write */ *ppos = count; ret = count; out: return ret; out_unlock: mutex_unlock(&userns_state_mutex); goto out; } bool userns_may_setgroups(const struct user_namespace *ns) { bool allowed; mutex_lock(&userns_state_mutex); /* It is not safe to use setgroups until a gid mapping in * the user namespace has been established. */ allowed = ns->gid_map.nr_extents != 0; /* Is setgroups allowed? */ allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); mutex_unlock(&userns_state_mutex); return allowed; } /* * Returns true if @child is the same namespace or a descendant of * @ancestor. */ bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { const struct user_namespace *ns; for (ns = child; ns->level > ancestor->level; ns = ns->parent) ; return (ns == ancestor); } bool current_in_userns(const struct user_namespace *target_ns) { return in_userns(target_ns, current_user_ns()); } EXPORT_SYMBOL(current_in_userns); static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); } static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; rcu_read_lock(); user_ns = get_user_ns(__task_cred(task)->user_ns); rcu_read_unlock(); return user_ns ? &user_ns->ns : NULL; } static void userns_put(struct ns_common *ns) { put_user_ns(to_user_ns(ns)); } static int userns_install(struct nsset *nsset, struct ns_common *ns) { struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; /* Don't allow gaining capabilities by reentering * the same user namespace. */ if (user_ns == current_user_ns()) return -EINVAL; /* Tasks that share a thread group must share a user namespace */ if (!thread_group_empty(current)) return -EINVAL; if (current->fs->users != 1) return -EINVAL; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; cred = nsset_cred(nsset); if (!cred) return -EINVAL; put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); return 0; } struct ns_common *ns_get_owner(struct ns_common *ns) { struct user_namespace *my_user_ns = current_user_ns(); struct user_namespace *owner, *p; /* See if the owner is in the current user namespace */ owner = p = ns->ops->owner(ns); for (;;) { if (!p) return ERR_PTR(-EPERM); if (p == my_user_ns) break; p = p->parent; } return &get_user_ns(owner)->ns; } static struct user_namespace *userns_owner(struct ns_common *ns) { return to_user_ns(ns)->parent; } const struct proc_ns_operations userns_operations = { .name = "user", .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, .owner = userns_owner, .get_parent = ns_get_owner, }; static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); return 0; } subsys_initcall(user_namespaces_init);
2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 /* SPDX-License-Identifier: GPL-2.0 */ /* * Access vector cache interface for object managers. * * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ #ifndef _SELINUX_AVC_H_ #define _SELINUX_AVC_H_ #include <linux/stddef.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/kdev_t.h> #include <linux/spinlock.h> #include <linux/init.h> #include <linux/audit.h> #include <linux/lsm_audit.h> #include <linux/in6.h> #include "flask.h" #include "av_permissions.h" #include "security.h" /* * An entry in the AVC. */ struct avc_entry; struct task_struct; struct inode; struct sock; struct sk_buff; /* * AVC statistics */ struct avc_cache_stats { unsigned int lookups; unsigned int misses; unsigned int allocations; unsigned int reclaims; unsigned int frees; }; /* * We only need this data after we have decided to send an audit message. */ struct selinux_audit_data { u32 ssid; u32 tsid; u16 tclass; u32 requested; u32 audited; u32 denied; int result; struct selinux_state *state; }; /* * AVC operations */ void __init avc_init(void); static inline u32 avc_audit_required(u32 requested, struct av_decision *avd, int result, u32 auditdeny, u32 *deniedp) { u32 denied, audited; denied = requested & ~avd->allowed; if (unlikely(denied)) { audited = denied & avd->auditdeny; /* * auditdeny is TRICKY! Setting a bit in * this field means that ANY denials should NOT be audited if * the policy contains an explicit dontaudit rule for that * permission. Take notice that this is unrelated to the * actual permissions that were denied. As an example lets * assume: * * denied == READ * avd.auditdeny & ACCESS == 0 (not set means explicit rule) * auditdeny & ACCESS == 1 * * We will NOT audit the denial even though the denied * permission was READ and the auditdeny checks were for * ACCESS */ if (auditdeny && !(auditdeny & avd->auditdeny)) audited = 0; } else if (result) audited = denied = requested; else audited = requested & avd->auditallow; *deniedp = denied; return audited; } int slow_avc_audit(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, u32 requested, u32 audited, u32 denied, int result, struct common_audit_data *a); /** * avc_audit - Audit the granting or denial of permissions. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @requested: requested permissions * @avd: access vector decisions * @result: result from avc_has_perm_noaudit * @a: auxiliary audit data * @flags: VFS walk flags * * Audit the granting or denial of permissions in accordance * with the policy. This function is typically called by * avc_has_perm() after a permission check, but can also be * called directly by callers who use avc_has_perm_noaudit() * in order to separate the permission check from the auditing. * For example, this separation is useful when the permission check must * be performed under a lock, to allow the lock to be released * before calling the auditing code. */ static inline int avc_audit(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, u32 requested, struct av_decision *avd, int result, struct common_audit_data *a, int flags) { u32 audited, denied; audited = avc_audit_required(requested, avd, result, 0, &denied); if (likely(!audited)) return 0; /* fall back to ref-walk if we have to generate audit */ if (flags & MAY_NOT_BLOCK) return -ECHILD; return slow_avc_audit(state, ssid, tsid, tclass, requested, audited, denied, result, a); } #define AVC_STRICT 1 /* Ignore permissive mode. */ #define AVC_EXTENDED_PERMS 2 /* update extended permissions */ #define AVC_NONBLOCKING 4 /* non blocking */ int avc_has_perm_noaudit(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, u32 requested, unsigned flags, struct av_decision *avd); int avc_has_perm(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, u32 requested, struct common_audit_data *auditdata); int avc_has_perm_flags(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, u32 requested, struct common_audit_data *auditdata, int flags); int avc_has_extended_perms(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, u32 requested, u8 driver, u8 perm, struct common_audit_data *ad); u32 avc_policy_seqno(struct selinux_state *state); #define AVC_CALLBACK_GRANT 1 #define AVC_CALLBACK_TRY_REVOKE 2 #define AVC_CALLBACK_REVOKE 4 #define AVC_CALLBACK_RESET 8 #define AVC_CALLBACK_AUDITALLOW_ENABLE 16 #define AVC_CALLBACK_AUDITALLOW_DISABLE 32 #define AVC_CALLBACK_AUDITDENY_ENABLE 64 #define AVC_CALLBACK_AUDITDENY_DISABLE 128 #define AVC_CALLBACK_ADD_XPERMS 256 int avc_add_callback(int (*callback)(u32 event), u32 events); /* Exported to selinuxfs */ struct selinux_avc; int avc_get_hash_stats(struct selinux_avc *avc, char *page); unsigned int avc_get_cache_threshold(struct selinux_avc *avc); void avc_set_cache_threshold(struct selinux_avc *avc, unsigned int cache_threshold); /* Attempt to free avc node cache */ void avc_disable(void); #ifdef CONFIG_SECURITY_SELINUX_AVC_STATS DECLARE_PER_CPU(struct avc_cache_stats, avc_cache_stats); #endif #endif /* _SELINUX_AVC_H_ */
2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 // SPDX-License-Identifier: GPL-2.0 /* * ring buffer based function tracer * * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com> * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> * * Originally taken from the RT patch by: * Arnaldo Carvalho de Melo <acme@redhat.com> * * Based on code from the latency_tracer, that is: * Copyright (C) 2004-2006 Ingo Molnar * Copyright (C) 2004 Nadia Yvette Chambers */ #include <linux/ring_buffer.h> #include <generated/utsrelease.h> #include <linux/stacktrace.h> #include <linux/writeback.h> #include <linux/kallsyms.h> #include <linux/security.h> #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/irqflags.h> #include <linux/debugfs.h> #include <linux/tracefs.h> #include <linux/pagemap.h> #include <linux/hardirq.h> #include <linux/linkage.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/ftrace.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/splice.h> #include <linux/kdebug.h> #include <linux/string.h> #include <linux/mount.h> #include <linux/rwsem.h> #include <linux/slab.h> #include <linux/ctype.h> #include <linux/init.h> #include <linux/kmemleak.h> #include <linux/poll.h> #include <linux/nmi.h> #include <linux/fs.h> #include <linux/trace.h> #include <linux/sched/clock.h> #include <linux/sched/rt.h> #include <linux/fsnotify.h> #include <linux/irq_work.h> #include <linux/workqueue.h> #include "trace.h" #include "trace_output.h" /* * On boot up, the ring buffer is set to the minimum size, so that * we do not waste memory on systems that are not using tracing. */ bool ring_buffer_expanded; /* * We need to change this state when a selftest is running. * A selftest will lurk into the ring-buffer to count the * entries inserted during the selftest although some concurrent * insertions into the ring-buffer such as trace_printk could occurred * at the same time, giving false positive or negative results. */ static bool __read_mostly tracing_selftest_running; /* * If boot-time tracing including tracers/events via kernel cmdline * is running, we do not want to run SELFTEST. */ bool __read_mostly tracing_selftest_disabled; #ifdef CONFIG_FTRACE_STARTUP_TEST void __init disable_tracing_selftest(const char *reason) { if (!tracing_selftest_disabled) { tracing_selftest_disabled = true; pr_info("Ftrace startup test is disabled due to %s\n", reason); } } #endif /* Pipe tracepoints to printk */ struct trace_iterator *tracepoint_print_iter; int tracepoint_printk; static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key); /* For tracers that don't implement custom flags */ static struct tracer_opt dummy_tracer_opt[] = { { } }; static int dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { return 0; } /* * To prevent the comm cache from being overwritten when no * tracing is active, only save the comm when a trace event * occurred. */ static DEFINE_PER_CPU(bool, trace_taskinfo_save); /* * Kill all tracing for good (never come back). * It is initialized to 1 but will turn to zero if the initialization * of the tracer is successful. But that is the only place that sets * this back to zero. */ static int tracing_disabled = 1; cpumask_var_t __read_mostly tracing_buffer_mask; /* * ftrace_dump_on_oops - variable to dump ftrace buffer on oops * * If there is an oops (or kernel panic) and the ftrace_dump_on_oops * is set, then ftrace_dump is called. This will output the contents * of the ftrace buffers to the console. This is very useful for * capturing traces that lead to crashes and outputing it to a * serial console. * * It is default off, but you can enable it with either specifying * "ftrace_dump_on_oops" in the kernel command line, or setting * /proc/sys/kernel/ftrace_dump_on_oops * Set 1 if you want to dump buffers of all CPUs * Set 2 if you want to dump the buffer of the CPU that triggered oops */ enum ftrace_dump_mode ftrace_dump_on_oops; /* When set, tracing will stop when a WARN*() is hit */ int __disable_trace_on_warning; #ifdef CONFIG_TRACE_EVAL_MAP_FILE /* Map of enums to their values, for "eval_map" file */ struct trace_eval_map_head { struct module *mod; unsigned long length; }; union trace_eval_map_item; struct trace_eval_map_tail { /* * "end" is first and points to NULL as it must be different * than "mod" or "eval_string" */ union trace_eval_map_item *next; const char *end; /* points to NULL */ }; static DEFINE_MUTEX(trace_eval_mutex); /* * The trace_eval_maps are saved in an array with two extra elements, * one at the beginning, and one at the end. The beginning item contains * the count of the saved maps (head.length), and the module they * belong to if not built in (head.mod). The ending item contains a * pointer to the next array of saved eval_map items. */ union trace_eval_map_item { struct trace_eval_map map; struct trace_eval_map_head head; struct trace_eval_map_tail tail; }; static union trace_eval_map_item *trace_eval_maps; #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ int tracing_set_tracer(struct trace_array *tr, const char *buf); static void ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer, unsigned long flags, int pc); #define MAX_TRACER_SIZE 100 static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; static bool allocate_snapshot; static int __init set_cmdline_ftrace(char *str) { strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ ring_buffer_expanded = true; return 1; } __setup("ftrace=", set_cmdline_ftrace); static int __init set_ftrace_dump_on_oops(char *str) { if (*str++ != '=' || !*str) { ftrace_dump_on_oops = DUMP_ALL; return 1; } if (!strcmp("orig_cpu", str)) { ftrace_dump_on_oops = DUMP_ORIG; return 1; } return 0; } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); static int __init stop_trace_on_warning(char *str) { if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) __disable_trace_on_warning = 1; return 1; } __setup("traceoff_on_warning", stop_trace_on_warning); static int __init boot_alloc_snapshot(char *str) { allocate_snapshot = true; /* We also need the main ring buffer expanded */ ring_buffer_expanded = true; return 1; } __setup("alloc_snapshot", boot_alloc_snapshot); static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; static int __init set_trace_boot_options(char *str) { strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); return 1; } __setup("trace_options=", set_trace_boot_options); static char trace_boot_clock_buf[MAX_TRACER_SIZE] __initdata; static char *trace_boot_clock __initdata; static int __init set_trace_boot_clock(char *str) { strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); trace_boot_clock = trace_boot_clock_buf; return 1; } __setup("trace_clock=", set_trace_boot_clock); static int __init set_tracepoint_printk(char *str) { /* Ignore the "tp_printk_stop_on_boot" param */ if (*str == '_') return 0; if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) tracepoint_printk = 1; return 1; } __setup("tp_printk", set_tracepoint_printk); unsigned long long ns2usecs(u64 nsec) { nsec += 500; do_div(nsec, 1000); return nsec; } static void trace_process_export(struct trace_export *export, struct ring_buffer_event *event, int flag) { struct trace_entry *entry; unsigned int size = 0; if (export->flags & flag) { entry = ring_buffer_event_data(event); size = ring_buffer_event_length(event); export->write(export, entry, size); } } static DEFINE_MUTEX(ftrace_export_lock); static struct trace_export __rcu *ftrace_exports_list __read_mostly; static DEFINE_STATIC_KEY_FALSE(trace_function_exports_enabled); static DEFINE_STATIC_KEY_FALSE(trace_event_exports_enabled); static DEFINE_STATIC_KEY_FALSE(trace_marker_exports_enabled); static inline void ftrace_exports_enable(struct trace_export *export) { if (export->flags & TRACE_EXPORT_FUNCTION) static_branch_inc(&trace_function_exports_enabled); if (export->flags & TRACE_EXPORT_EVENT) static_branch_inc(&trace_event_exports_enabled); if (export->flags & TRACE_EXPORT_MARKER) static_branch_inc(&trace_marker_exports_enabled); } static inline void ftrace_exports_disable(struct trace_export *export) { if (export->flags & TRACE_EXPORT_FUNCTION) static_branch_dec(&trace_function_exports_enabled); if (export->flags & TRACE_EXPORT_EVENT) static_branch_dec(&trace_event_exports_enabled); if (export->flags & TRACE_EXPORT_MARKER) static_branch_dec(&trace_marker_exports_enabled); } static void ftrace_exports(struct ring_buffer_event *event, int flag) { struct trace_export *export; preempt_disable_notrace(); export = rcu_dereference_raw_check(ftrace_exports_list); while (export) { trace_process_export(export, event, flag); export = rcu_dereference_raw_check(export->next); } preempt_enable_notrace(); } static inline void add_trace_export(struct trace_export **list, struct trace_export *export) { rcu_assign_pointer(export->next, *list); /* * We are entering export into the list but another * CPU might be walking that list. We need to make sure * the export->next pointer is valid before another CPU sees * the export pointer included into the list. */ rcu_assign_pointer(*list, export); } static inline int rm_trace_export(struct trace_export **list, struct trace_export *export) { struct trace_export **p; for (p = list; *p != NULL; p = &(*p)->next) if (*p == export) break; if (*p != export) return -1; rcu_assign_pointer(*p, (*p)->next); return 0; } static inline void add_ftrace_export(struct trace_export **list, struct trace_export *export) { ftrace_exports_enable(export); add_trace_export(list, export); } static inline int rm_ftrace_export(struct trace_export **list, struct trace_export *export) { int ret; ret = rm_trace_export(list, export); ftrace_exports_disable(export); return ret; } int register_ftrace_export(struct trace_export *export) { if (WARN_ON_ONCE(!export->write)) return -1; mutex_lock(&ftrace_export_lock); add_ftrace_export(&ftrace_exports_list, export); mutex_unlock(&ftrace_export_lock); return 0; } EXPORT_SYMBOL_GPL(register_ftrace_export); int unregister_ftrace_export(struct trace_export *export) { int ret; mutex_lock(&ftrace_export_lock); ret = rm_ftrace_export(&ftrace_exports_list, export); mutex_unlock(&ftrace_export_lock); return ret; } EXPORT_SYMBOL_GPL(unregister_ftrace_export); /* trace_flags holds trace_options default values */ #define TRACE_DEFAULT_FLAGS \ (FUNCTION_DEFAULT_FLAGS | \ TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | \ TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \ TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \ TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS) /* trace_options that are only supported by global_trace */ #define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \ TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD) /* trace_flags that are default zero for instances */ #define ZEROED_TRACE_FLAGS \ (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK) /* * The global_trace is the descriptor that holds the top-level tracing * buffers for the live tracing. */ static struct trace_array global_trace = { .trace_flags = TRACE_DEFAULT_FLAGS, }; LIST_HEAD(ftrace_trace_arrays); int trace_array_get(struct trace_array *this_tr) { struct trace_array *tr; int ret = -ENODEV; mutex_lock(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (tr == this_tr) { tr->ref++; ret = 0; break; } } mutex_unlock(&trace_types_lock); return ret; } static void __trace_array_put(struct trace_array *this_tr) { WARN_ON(!this_tr->ref); this_tr->ref--; } /** * trace_array_put - Decrement the reference counter for this trace array. * * NOTE: Use this when we no longer need the trace array returned by * trace_array_get_by_name(). This ensures the trace array can be later * destroyed. * */ void trace_array_put(struct trace_array *this_tr) { if (!this_tr) return; mutex_lock(&trace_types_lock); __trace_array_put(this_tr); mutex_unlock(&trace_types_lock); } EXPORT_SYMBOL_GPL(trace_array_put); int tracing_check_open_get_tr(struct trace_array *tr) { int ret; ret = security_locked_down(LOCKDOWN_TRACEFS); if (ret) return ret; if (tracing_disabled) return -ENODEV; if (tr && trace_array_get(tr) < 0) return -ENODEV; return 0; } int call_filter_check_discard(struct trace_event_call *call, void *rec, struct trace_buffer *buffer, struct ring_buffer_event *event) { if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && !filter_match_preds(call->filter, rec)) { __trace_event_discard_commit(buffer, event); return 1; } return 0; } void trace_free_pid_list(struct trace_pid_list *pid_list) { vfree(pid_list->pids); kfree(pid_list); } /** * trace_find_filtered_pid - check if a pid exists in a filtered_pid list * @filtered_pids: The list of pids to check * @search_pid: The PID to find in @filtered_pids * * Returns true if @search_pid is fonud in @filtered_pids, and false otherwis. */ bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) { /* * If pid_max changed after filtered_pids was created, we * by default ignore all pids greater than the previous pid_max. */ if (search_pid >= filtered_pids->pid_max) return false; return test_bit(search_pid, filtered_pids->pids); } /** * trace_ignore_this_task - should a task be ignored for tracing * @filtered_pids: The list of pids to check * @task: The task that should be ignored if not filtered * * Checks if @task should be traced or not from @filtered_pids. * Returns true if @task should *NOT* be traced. * Returns false if @task should be traced. */ bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct trace_pid_list *filtered_no_pids, struct task_struct *task) { /* * If filterd_no_pids is not empty, and the task's pid is listed * in filtered_no_pids, then return true. * Otherwise, if filtered_pids is empty, that means we can * trace all tasks. If it has content, then only trace pids * within filtered_pids. */ return (filtered_pids && !trace_find_filtered_pid(filtered_pids, task->pid)) || (filtered_no_pids && trace_find_filtered_pid(filtered_no_pids, task->pid)); } /** * trace_filter_add_remove_task - Add or remove a task from a pid_list * @pid_list: The list to modify * @self: The current task for fork or NULL for exit * @task: The task to add or remove * * If adding a task, if @self is defined, the task is only added if @self * is also included in @pid_list. This happens on fork and tasks should * only be added when the parent is listed. If @self is NULL, then the * @task pid will be removed from the list, which would happen on exit * of a task. */ void trace_filter_add_remove_task(struct trace_pid_list *pid_list, struct task_struct *self, struct task_struct *task) { if (!pid_list) return; /* For forks, we only add if the forking task is listed */ if (self) { if (!trace_find_filtered_pid(pid_list, self->pid)) return; } /* Sorry, but we don't support pid_max changing after setting */ if (task->pid >= pid_list->pid_max) return; /* "self" is set for forks, and NULL for exits */ if (self) set_bit(task->pid, pid_list->pids); else clear_bit(task->pid, pid_list->pids); } /** * trace_pid_next - Used for seq_file to get to the next pid of a pid_list * @pid_list: The pid list to show * @v: The last pid that was shown (+1 the actual pid to let zero be displayed) * @pos: The position of the file * * This is used by the seq_file "next" operation to iterate the pids * listed in a trace_pid_list structure. * * Returns the pid+1 as we want to display pid of zero, but NULL would * stop the iteration. */ void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) { unsigned long pid = (unsigned long)v; (*pos)++; /* pid already is +1 of the actual prevous bit */ pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); /* Return pid + 1 to allow zero to be represented */ if (pid < pid_list->pid_max) return (void *)(pid + 1); return NULL; } /** * trace_pid_start - Used for seq_file to start reading pid lists * @pid_list: The pid list to show * @pos: The position of the file * * This is used by seq_file "start" operation to start the iteration * of listing pids. * * Returns the pid+1 as we want to display pid of zero, but NULL would * stop the iteration. */ void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos) { unsigned long pid; loff_t l = 0; pid = find_first_bit(pid_list->pids, pid_list->pid_max); if (pid >= pid_list->pid_max) return NULL; /* Return pid + 1 so that zero can be the exit value */ for (pid++; pid && l < *pos; pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l)) ; return (void *)pid; } /** * trace_pid_show - show the current pid in seq_file processing * @m: The seq_file structure to write into * @v: A void pointer of the pid (+1) value to display * * Can be directly used by seq_file operations to display the current * pid value. */ int trace_pid_show(struct seq_file *m, void *v) { unsigned long pid = (unsigned long)v - 1; seq_printf(m, "%lu\n", pid); return 0; } /* 128 should be much more than enough */ #define PID_BUF_SIZE 127 int trace_pid_write(struct trace_pid_list *filtered_pids, struct trace_pid_list **new_pid_list, const char __user *ubuf, size_t cnt) { struct trace_pid_list *pid_list; struct trace_parser parser; unsigned long val; int nr_pids = 0; ssize_t read = 0; ssize_t ret = 0; loff_t pos; pid_t pid; if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1)) return -ENOMEM; /* * Always recreate a new array. The write is an all or nothing * operation. Always create a new array when adding new pids by * the user. If the operation fails, then the current list is * not modified. */ pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); if (!pid_list) { trace_parser_put(&parser); return -ENOMEM; } pid_list->pid_max = READ_ONCE(pid_max); /* Only truncating will shrink pid_max */ if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) pid_list->pid_max = filtered_pids->pid_max; pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); if (!pid_list->pids) { trace_parser_put(&parser); kfree(pid_list); return -ENOMEM; } if (filtered_pids) { /* copy the current bits to the new max */ for_each_set_bit(pid, filtered_pids->pids, filtered_pids->pid_max) { set_bit(pid, pid_list->pids); nr_pids++; } } while (cnt > 0) { pos = 0; ret = trace_get_user(&parser, ubuf, cnt, &pos); if (ret < 0 || !trace_parser_loaded(&parser)) break; read += ret; ubuf += ret; cnt -= ret; ret = -EINVAL; if (kstrtoul(parser.buffer, 0, &val)) break; if (val >= pid_list->pid_max) break; pid = (pid_t)val; set_bit(pid, pid_list->pids); nr_pids++; trace_parser_clear(&parser); ret = 0; } trace_parser_put(&parser); if (ret < 0) { trace_free_pid_list(pid_list); return ret; } if (!nr_pids) { /* Cleared the list of pids */ trace_free_pid_list(pid_list); read = ret; pid_list = NULL; } *new_pid_list = pid_list; return read; } static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu) { u64 ts; /* Early boot up does not have a buffer yet */ if (!buf->buffer) return trace_clock_local(); ts = ring_buffer_time_stamp(buf->buffer, cpu); ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts); return ts; } u64 ftrace_now(int cpu) { return buffer_ftrace_now(&global_trace.array_buffer, cpu); } /** * tracing_is_enabled - Show if global_trace has been disabled * * Shows if the global trace has been enabled or not. It uses the * mirror flag "buffer_disabled" to be used in fast paths such as for * the irqsoff tracer. But it may be inaccurate due to races. If you * need to know the accurate state, use tracing_is_on() which is a little * slower, but accurate. */ int tracing_is_enabled(void) { /* * For quick access (irqsoff uses this in fast path), just * return the mirror variable of the state of the ring buffer. * It's a little racy, but we don't really care. */ smp_rmb(); return !global_trace.buffer_disabled; } /* * trace_buf_size is the size in bytes that is allocated * for a buffer. Note, the number of bytes is always rounded * to page size. * * This number is purposely set to a low number of 16384. * If the dump on oops happens, it will be much appreciated * to not have to wait for all that output. Anyway this can be * boot time and run time configurable. */ #define TRACE_BUF_SIZE_DEFAULT 1441792UL /* 16384 * 88 (sizeof(entry)) */ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT; /* trace_types holds a link list of available tracers. */ static struct tracer *trace_types __read_mostly; /* * trace_types_lock is used to protect the trace_types list. */ DEFINE_MUTEX(trace_types_lock); /* * serialize the access of the ring buffer * * ring buffer serializes readers, but it is low level protection. * The validity of the events (which returns by ring_buffer_peek() ..etc) * are not protected by ring buffer. * * The content of events may become garbage if we allow other process consumes * these events concurrently: * A) the page of the consumed events may become a normal page * (not reader page) in ring buffer, and this page will be rewrited * by events producer. * B) The page of the consumed events may become a page for splice_read, * and this page will be returned to system. * * These primitives allow multi process access to different cpu ring buffer * concurrently. * * These primitives don't distinguish read-only and read-consume access. * Multi read-only access are also serialized. */ #ifdef CONFIG_SMP static DECLARE_RWSEM(all_cpu_access_lock); static DEFINE_PER_CPU(struct mutex, cpu_access_lock); static inline void trace_access_lock(int cpu) { if (cpu == RING_BUFFER_ALL_CPUS) { /* gain it for accessing the whole ring buffer. */ down_write(&all_cpu_access_lock); } else { /* gain it for accessing a cpu ring buffer. */ /* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */ down_read(&all_cpu_access_lock); /* Secondly block other access to this @cpu ring buffer. */ mutex_lock(&per_cpu(cpu_access_lock, cpu)); } } static inline void trace_access_unlock(int cpu) { if (cpu == RING_BUFFER_ALL_CPUS) { up_write(&all_cpu_access_lock); } else { mutex_unlock(&per_cpu(cpu_access_lock, cpu)); up_read(&all_cpu_access_lock); } } static inline void trace_access_lock_init(void) { int cpu; for_each_possible_cpu(cpu) mutex_init(&per_cpu(cpu_access_lock, cpu)); } #else static DEFINE_MUTEX(access_lock); static inline void trace_access_lock(int cpu) { (void)cpu; mutex_lock(&access_lock); } static inline void trace_access_unlock(int cpu) { (void)cpu; mutex_unlock(&access_lock); } static inline void trace_access_lock_init(void) { } #endif #ifdef CONFIG_STACKTRACE static void __ftrace_trace_stack(struct trace_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs); static inline void ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs); #else static inline void __ftrace_trace_stack(struct trace_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs) { } static inline void ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs) { } #endif static __always_inline void trace_event_setup(struct ring_buffer_event *event, int type, unsigned long flags, int pc) { struct trace_entry *ent = ring_buffer_event_data(event); tracing_generic_entry_update(ent, type, flags, pc); } static __always_inline struct ring_buffer_event * __trace_buffer_lock_reserve(struct trace_buffer *buffer, int type, unsigned long len, unsigned long flags, int pc) { struct ring_buffer_event *event; event = ring_buffer_lock_reserve(buffer, len); if (event != NULL) trace_event_setup(event, type, flags, pc); return event; } void tracer_tracing_on(struct trace_array *tr) { if (tr->array_buffer.buffer) ring_buffer_record_on(tr->array_buffer.buffer); /* * This flag is looked at when buffers haven't been allocated * yet, or by some tracers (like irqsoff), that just want to * know if the ring buffer has been disabled, but it can handle * races of where it gets disabled but we still do a record. * As the check is in the fast path of the tracers, it is more * important to be fast than accurate. */ tr->buffer_disabled = 0; /* Make the flag seen by readers */ smp_wmb(); } /** * tracing_on - enable tracing buffers * * This function enables tracing buffers that may have been * disabled with tracing_off. */ void tracing_on(void) { tracer_tracing_on(&global_trace); } EXPORT_SYMBOL_GPL(tracing_on); static __always_inline void __buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) { __this_cpu_write(trace_taskinfo_save, true); /* If this is the temp buffer, we need to commit fully */ if (this_cpu_read(trace_buffered_event) == event) { /* Length is in event->array[0] */ ring_buffer_write(buffer, event->array[0], &event->array[1]); /* Release the temp buffer */ this_cpu_dec(trace_buffered_event_cnt); } else ring_buffer_unlock_commit(buffer, event); } /** * __trace_puts - write a constant string into the trace buffer. * @ip: The address of the caller * @str: The constant string to write * @size: The size of the string. */ int __trace_puts(unsigned long ip, const char *str, int size) { struct ring_buffer_event *event; struct trace_buffer *buffer; struct print_entry *entry; unsigned long irq_flags; int alloc; int pc; if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; pc = preempt_count(); if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; alloc = sizeof(*entry) + size + 2; /* possible \n added */ local_save_flags(irq_flags); buffer = global_trace.array_buffer.buffer; ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, irq_flags, pc); if (!event) { size = 0; goto out; } entry = ring_buffer_event_data(event); entry->ip = ip; memcpy(&entry->buf, str, size); /* Add a newline if necessary */ if (entry->buf[size - 1] != '\n') { entry->buf[size] = '\n'; entry->buf[size + 1] = '\0'; } else entry->buf[size] = '\0'; __buffer_unlock_commit(buffer, event); ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL); out: ring_buffer_nest_end(buffer); return size; } EXPORT_SYMBOL_GPL(__trace_puts); /** * __trace_bputs - write the pointer to a constant string into trace buffer * @ip: The address of the caller * @str: The constant string to write to the buffer to */ int __trace_bputs(unsigned long ip, const char *str) { struct ring_buffer_event *event; struct trace_buffer *buffer; struct bputs_entry *entry; unsigned long irq_flags; int size = sizeof(struct bputs_entry); int ret = 0; int pc; if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; pc = preempt_count(); if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; local_save_flags(irq_flags); buffer = global_trace.array_buffer.buffer; ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, irq_flags, pc); if (!event) goto out; entry = ring_buffer_event_data(event); entry->ip = ip; entry->str = str; __buffer_unlock_commit(buffer, event); ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL); ret = 1; out: ring_buffer_nest_end(buffer); return ret; } EXPORT_SYMBOL_GPL(__trace_bputs); #ifdef CONFIG_TRACER_SNAPSHOT static void tracing_snapshot_instance_cond(struct trace_array *tr, void *cond_data) { struct tracer *tracer = tr->current_trace; unsigned long flags; if (in_nmi()) { internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); internal_trace_puts("*** snapshot is being ignored ***\n"); return; } if (!tr->allocated_snapshot) { internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n"); internal_trace_puts("*** stopping trace here! ***\n"); tracing_off(); return; } /* Note, snapshot can not be used when the tracer uses it */ if (tracer->use_max_tr) { internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n"); internal_trace_puts("*** Can not use snapshot (sorry) ***\n"); return; } local_irq_save(flags); update_max_tr(tr, current, smp_processor_id(), cond_data); local_irq_restore(flags); } void tracing_snapshot_instance(struct trace_array *tr) { tracing_snapshot_instance_cond(tr, NULL); } /** * tracing_snapshot - take a snapshot of the current buffer. * * This causes a swap between the snapshot buffer and the current live * tracing buffer. You can use this to take snapshots of the live * trace when some condition is triggered, but continue to trace. * * Note, make sure to allocate the snapshot with either * a tracing_snapshot_alloc(), or by doing it manually * with: echo 1 > /sys/kernel/debug/tracing/snapshot * * If the snapshot buffer is not allocated, it will stop tracing. * Basically making a permanent snapshot. */ void tracing_snapshot(void) { struct trace_array *tr = &global_trace; tracing_snapshot_instance(tr); } EXPORT_SYMBOL_GPL(tracing_snapshot); /** * tracing_snapshot_cond - conditionally take a snapshot of the current buffer. * @tr: The tracing instance to snapshot * @cond_data: The data to be tested conditionally, and possibly saved * * This is the same as tracing_snapshot() except that the snapshot is * conditional - the snapshot will only happen if the * cond_snapshot.update() implementation receiving the cond_data * returns true, which means that the trace array's cond_snapshot * update() operation used the cond_data to determine whether the * snapshot should be taken, and if it was, presumably saved it along * with the snapshot. */ void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) { tracing_snapshot_instance_cond(tr, cond_data); } EXPORT_SYMBOL_GPL(tracing_snapshot_cond); /** * tracing_snapshot_cond_data - get the user data associated with a snapshot * @tr: The tracing instance * * When the user enables a conditional snapshot using * tracing_snapshot_cond_enable(), the user-defined cond_data is saved * with the snapshot. This accessor is used to retrieve it. * * Should not be called from cond_snapshot.update(), since it takes * the tr->max_lock lock, which the code calling * cond_snapshot.update() has already done. * * Returns the cond_data associated with the trace array's snapshot. */ void *tracing_cond_snapshot_data(struct trace_array *tr) { void *cond_data = NULL; local_irq_disable(); arch_spin_lock(&tr->max_lock); if (tr->cond_snapshot) cond_data = tr->cond_snapshot->cond_data; arch_spin_unlock(&tr->max_lock); local_irq_enable(); return cond_data; } EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, struct array_buffer *size_buf, int cpu_id); static void set_buffer_entries(struct array_buffer *buf, unsigned long val); int tracing_alloc_snapshot_instance(struct trace_array *tr) { int ret; if (!tr->allocated_snapshot) { /* allocate spare buffer */ ret = resize_buffer_duplicate_size(&tr->max_buffer, &tr->array_buffer, RING_BUFFER_ALL_CPUS); if (ret < 0) return ret; tr->allocated_snapshot = true; } return 0; } static void free_snapshot(struct trace_array *tr) { /* * We don't free the ring buffer. instead, resize it because * The max_tr ring buffer has some state (e.g. ring->clock) and * we want preserve it. */ ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); set_buffer_entries(&tr->max_buffer, 1); tracing_reset_online_cpus(&tr->max_buffer); tr->allocated_snapshot = false; } /** * tracing_alloc_snapshot - allocate snapshot buffer. * * This only allocates the snapshot buffer if it isn't already * allocated - it doesn't also take a snapshot. * * This is meant to be used in cases where the snapshot buffer needs * to be set up for events that can't sleep but need to be able to * trigger a snapshot. */ int tracing_alloc_snapshot(void) { struct trace_array *tr = &global_trace; int ret; ret = tracing_alloc_snapshot_instance(tr); WARN_ON(ret < 0); return ret; } EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); /** * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer. * * This is similar to tracing_snapshot(), but it will allocate the * snapshot buffer if it isn't already allocated. Use this only * where it is safe to sleep, as the allocation may sleep. * * This causes a swap between the snapshot buffer and the current live * tracing buffer. You can use this to take snapshots of the live * trace when some condition is triggered, but continue to trace. */ void tracing_snapshot_alloc(void) { int ret; ret = tracing_alloc_snapshot(); if (ret < 0) return; tracing_snapshot(); } EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); /** * tracing_snapshot_cond_enable - enable conditional snapshot for an instance * @tr: The tracing instance * @cond_data: User data to associate with the snapshot * @update: Implementation of the cond_snapshot update function * * Check whether the conditional snapshot for the given instance has * already been enabled, or if the current tracer is already using a * snapshot; if so, return -EBUSY, else create a cond_snapshot and * save the cond_data and update function inside. * * Returns 0 if successful, error otherwise. */ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update) { struct cond_snapshot *cond_snapshot; int ret = 0; cond_snapshot = kzalloc(sizeof(*cond_snapshot), GFP_KERNEL); if (!cond_snapshot) return -ENOMEM; cond_snapshot->cond_data = cond_data; cond_snapshot->update = update; mutex_lock(&trace_types_lock); ret = tracing_alloc_snapshot_instance(tr); if (ret) goto fail_unlock; if (tr->current_trace->use_max_tr) { ret = -EBUSY; goto fail_unlock; } /* * The cond_snapshot can only change to NULL without the * trace_types_lock. We don't care if we race with it going * to NULL, but we want to make sure that it's not set to * something other than NULL when we get here, which we can * do safely with only holding the trace_types_lock and not * having to take the max_lock. */ if (tr->cond_snapshot) { ret = -EBUSY; goto fail_unlock; } local_irq_disable(); arch_spin_lock(&tr->max_lock); tr->cond_snapshot = cond_snapshot; arch_spin_unlock(&tr->max_lock); local_irq_enable(); mutex_unlock(&trace_types_lock); return ret; fail_unlock: mutex_unlock(&trace_types_lock); kfree(cond_snapshot); return ret; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); /** * tracing_snapshot_cond_disable - disable conditional snapshot for an instance * @tr: The tracing instance * * Check whether the conditional snapshot for the given instance is * enabled; if so, free the cond_snapshot associated with it, * otherwise return -EINVAL. * * Returns 0 if successful, error otherwise. */ int tracing_snapshot_cond_disable(struct trace_array *tr) { int ret = 0; local_irq_disable(); arch_spin_lock(&tr->max_lock); if (!tr->cond_snapshot) ret = -EINVAL; else { kfree(tr->cond_snapshot); tr->cond_snapshot = NULL; } arch_spin_unlock(&tr->max_lock); local_irq_enable(); return ret; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #else void tracing_snapshot(void) { WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); } EXPORT_SYMBOL_GPL(tracing_snapshot); void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) { WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used"); } EXPORT_SYMBOL_GPL(tracing_snapshot_cond); int tracing_alloc_snapshot(void) { WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used"); return -ENODEV; } EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); void tracing_snapshot_alloc(void) { /* Give warning */ tracing_snapshot(); } EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); void *tracing_cond_snapshot_data(struct trace_array *tr) { return NULL; } EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update) { return -ENODEV; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); int tracing_snapshot_cond_disable(struct trace_array *tr) { return false; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #endif /* CONFIG_TRACER_SNAPSHOT */ void tracer_tracing_off(struct trace_array *tr) { if (tr->array_buffer.buffer) ring_buffer_record_off(tr->array_buffer.buffer); /* * This flag is looked at when buffers haven't been allocated * yet, or by some tracers (like irqsoff), that just want to * know if the ring buffer has been disabled, but it can handle * races of where it gets disabled but we still do a record. * As the check is in the fast path of the tracers, it is more * important to be fast than accurate. */ tr->buffer_disabled = 1; /* Make the flag seen by readers */ smp_wmb(); } /** * tracing_off - turn off tracing buffers * * This function stops the tracing buffers from recording data. * It does not disable any overhead the tracers themselves may * be causing. This function simply causes all recording to * the ring buffers to fail. */ void tracing_off(void) { tracer_tracing_off(&global_trace); } EXPORT_SYMBOL_GPL(tracing_off); void disable_trace_on_warning(void) { if (__disable_trace_on_warning) { trace_array_printk_buf(global_trace.array_buffer.buffer, _THIS_IP_, "Disabling tracing due to warning\n"); tracing_off(); } } /** * tracer_tracing_is_on - show real state of ring buffer enabled * @tr : the trace array to know if ring buffer is enabled * * Shows real state of the ring buffer if it is enabled or not. */ bool tracer_tracing_is_on(struct trace_array *tr) { if (tr->array_buffer.buffer) return ring_buffer_record_is_on(tr->array_buffer.buffer); return !tr->buffer_disabled; } /** * tracing_is_on - show state of ring buffers enabled */ int tracing_is_on(void) { return tracer_tracing_is_on(&global_trace); } EXPORT_SYMBOL_GPL(tracing_is_on); static int __init set_buf_size(char *str) { unsigned long buf_size; if (!str) return 0; buf_size = memparse(str, &str); /* * nr_entries can not be zero and the startup * tests require some buffer space. Therefore * ensure we have at least 4096 bytes of buffer. */ trace_buf_size = max(4096UL, buf_size); return 1; } __setup("trace_buf_size=", set_buf_size); static int __init set_tracing_thresh(char *str) { unsigned long threshold; int ret; if (!str) return 0; ret = kstrtoul(str, 0, &threshold); if (ret < 0) return 0; tracing_thresh = threshold * 1000; return 1; } __setup("tracing_thresh=", set_tracing_thresh); unsigned long nsecs_to_usecs(unsigned long nsecs) { return nsecs / 1000; } /* * TRACE_FLAGS is defined as a tuple matching bit masks with strings. * It uses C(a, b) where 'a' is the eval (enum) name and 'b' is the string that * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list * of strings in the order that the evals (enum) were defined. */ #undef C #define C(a, b) b /* These must match the bit postions in trace_iterator_flags */ static const char *trace_options[] = { TRACE_FLAGS NULL }; static struct { u64 (*func)(void); const char *name; int in_ns; /* is this clock in nanoseconds? */ } trace_clocks[] = { { trace_clock_local, "local", 1 }, { trace_clock_global, "global", 1 }, { trace_clock_counter, "counter", 0 }, { trace_clock_jiffies, "uptime", 0 }, { trace_clock, "perf", 1 }, { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, { ktime_get_boot_fast_ns, "boot", 1 }, ARCH_TRACE_CLOCKS }; bool trace_clock_in_ns(struct trace_array *tr) { if (trace_clocks[tr->clock_id].in_ns) return true; return false; } /* * trace_parser_get_init - gets the buffer for trace parser */ int trace_parser_get_init(struct trace_parser *parser, int size) { memset(parser, 0, sizeof(*parser)); parser->buffer = kmalloc(size, GFP_KERNEL); if (!parser->buffer) return 1; parser->size = size; return 0; } /* * trace_parser_put - frees the buffer for trace parser */ void trace_parser_put(struct trace_parser *parser) { kfree(parser->buffer); parser->buffer = NULL; } /* * trace_get_user - reads the user input string separated by space * (matched by isspace(ch)) * * For each string found the 'struct trace_parser' is updated, * and the function returns. * * Returns number of bytes read. * * See kernel/trace/trace.h for 'struct trace_parser' details. */ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, size_t cnt, loff_t *ppos) { char ch; size_t read = 0; ssize_t ret; if (!*ppos) trace_parser_clear(parser); ret = get_user(ch, ubuf++); if (ret) goto out; read++; cnt--; /* * The parser is not finished with the last write, * continue reading the user input without skipping spaces. */ if (!parser->cont) { /* skip white space */ while (cnt && isspace(ch)) { ret = get_user(ch, ubuf++); if (ret) goto out; read++; cnt--; } parser->idx = 0; /* only spaces were written */ if (isspace(ch) || !ch) { *ppos += read; ret = read; goto out; } } /* read the non-space input */ while (cnt && !isspace(ch) && ch) { if (parser->idx < parser->size - 1) parser->buffer[parser->idx++] = ch; else { ret = -EINVAL; goto out; } ret = get_user(ch, ubuf++); if (ret) goto out; read++; cnt--; } /* We either got finished input or we have to wait for another call. */ if (isspace(ch) || !ch) { parser->buffer[parser->idx] = 0; parser->cont = false; } else if (parser->idx < parser->size - 1) { parser->cont = true; parser->buffer[parser->idx++] = ch; /* Make sure the parsed string always terminates with '\0'. */ parser->buffer[parser->idx] = 0; } else { ret = -EINVAL; goto out; } *ppos += read; ret = read; out: return ret; } /* TODO add a seq_buf_to_buffer() */ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) { int len; if (trace_seq_used(s) <= s->seq.readpos) return -EBUSY; len = trace_seq_used(s) - s->seq.readpos; if (cnt > len) cnt = len; memcpy(buf, s->buffer + s->seq.readpos, cnt); s->seq.readpos += cnt; return cnt; } unsigned long __read_mostly tracing_thresh; static const struct file_operations tracing_max_lat_fops; #if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ defined(CONFIG_FSNOTIFY) static struct workqueue_struct *fsnotify_wq; static void latency_fsnotify_workfn(struct work_struct *work) { struct trace_array *tr = container_of(work, struct trace_array, fsnotify_work); fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY); } static void latency_fsnotify_workfn_irq(struct irq_work *iwork) { struct trace_array *tr = container_of(iwork, struct trace_array, fsnotify_irqwork); queue_work(fsnotify_wq, &tr->fsnotify_work); } static void trace_create_maxlat_file(struct trace_array *tr, struct dentry *d_tracer) { INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); tr->d_max_latency = trace_create_file("tracing_max_latency", 0644, d_tracer, &tr->max_latency, &tracing_max_lat_fops); } __init static int latency_fsnotify_init(void) { fsnotify_wq = alloc_workqueue("tr_max_lat_wq", WQ_UNBOUND | WQ_HIGHPRI, 0); if (!fsnotify_wq) { pr_err("Unable to allocate tr_max_lat_wq\n"); return -ENOMEM; } return 0; } late_initcall_sync(latency_fsnotify_init); void latency_fsnotify(struct trace_array *tr) { if (!fsnotify_wq) return; /* * We cannot call queue_work(&tr->fsnotify_work) from here because it's * possible that we are called from __schedule() or do_idle(), which * could cause a deadlock. */ irq_work_queue(&tr->fsnotify_irqwork); } /* * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ * defined(CONFIG_FSNOTIFY) */ #else #define trace_create_maxlat_file(tr, d_tracer) \ trace_create_file("tracing_max_latency", 0644, d_tracer, \ &tr->max_latency, &tracing_max_lat_fops) #endif #ifdef CONFIG_TRACER_MAX_TRACE /* * Copy the new maximum trace into the separate maximum-trace * structure. (this way the maximum trace is permanently saved, * for later retrieval via /sys/kernel/tracing/tracing_max_latency) */ static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct array_buffer *trace_buf = &tr->array_buffer; struct array_buffer *max_buf = &tr->max_buffer; struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); max_buf->cpu = cpu; max_buf->time_start = data->preempt_timestamp; max_data->saved_latency = tr->max_latency; max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; strncpy(max_data->comm, tsk->comm, TASK_COMM_LEN); max_data->pid = tsk->pid; /* * If tsk == current, then use current_uid(), as that does not use * RCU. The irq tracer can be called out of RCU scope. */ if (tsk == current) max_data->uid = current_uid(); else max_data->uid = task_uid(tsk); max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; max_data->policy = tsk->policy; max_data->rt_priority = tsk->rt_priority; /* record this tasks comm */ tracing_record_cmdline(tsk); latency_fsnotify(tr); } /** * update_max_tr - snapshot all trace buffers from global_trace to max_tr * @tr: tracer * @tsk: the task with the latency * @cpu: The cpu that initiated the trace. * @cond_data: User data associated with a conditional snapshot * * Flip the buffers between the @tr and the max_tr and record information * about which task was the cause of this latency. */ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, void *cond_data) { if (tr->stop_count) return; WARN_ON_ONCE(!irqs_disabled()); if (!tr->allocated_snapshot) { /* Only the nop tracer should hit this when disabling */ WARN_ON_ONCE(tr->current_trace != &nop_trace); return; } arch_spin_lock(&tr->max_lock); /* Inherit the recordable setting from array_buffer */ if (ring_buffer_record_is_set_on(tr->array_buffer.buffer)) ring_buffer_record_on(tr->max_buffer.buffer); else ring_buffer_record_off(tr->max_buffer.buffer); #ifdef CONFIG_TRACER_SNAPSHOT if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) goto out_unlock; #endif swap(tr->array_buffer.buffer, tr->max_buffer.buffer); __update_max_tr(tr, tsk, cpu); out_unlock: arch_spin_unlock(&tr->max_lock); } /** * update_max_tr_single - only copy one trace over, and reset the rest * @tr: tracer * @tsk: task with the latency * @cpu: the cpu of the buffer to copy. * * Flip the trace of a single CPU buffer between the @tr and the max_tr. */ void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) { int ret; if (tr->stop_count) return; WARN_ON_ONCE(!irqs_disabled()); if (!tr->allocated_snapshot) { /* Only the nop tracer should hit this when disabling */ WARN_ON_ONCE(tr->current_trace != &nop_trace); return; } arch_spin_lock(&tr->max_lock); ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->array_buffer.buffer, cpu); if (ret == -EBUSY) { /* * We failed to swap the buffer due to a commit taking * place on this CPU. We fail to record, but we reset * the max trace buffer (no one writes directly to it) * and flag that it failed. * Another reason is resize is in progress. */ trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_, "Failed to swap buffers due to commit or resize in progress\n"); } WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); __update_max_tr(tr, tsk, cpu); arch_spin_unlock(&tr->max_lock); /* Any waiters on the old snapshot buffer need to wake up */ ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS); } #endif /* CONFIG_TRACER_MAX_TRACE */ static int wait_on_pipe(struct trace_iterator *iter, int full) { int ret; /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) return 0; ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full); #ifdef CONFIG_TRACER_MAX_TRACE /* * Make sure this is still the snapshot buffer, as if a snapshot were * to happen, this would now be the main buffer. */ if (iter->snapshot) iter->array_buffer = &iter->tr->max_buffer; #endif return ret; } #ifdef CONFIG_FTRACE_STARTUP_TEST static bool selftests_can_run; struct trace_selftests { struct list_head list; struct tracer *type; }; static LIST_HEAD(postponed_selftests); static int save_selftest(struct tracer *type) { struct trace_selftests *selftest; selftest = kmalloc(sizeof(*selftest), GFP_KERNEL); if (!selftest) return -ENOMEM; selftest->type = type; list_add(&selftest->list, &postponed_selftests); return 0; } static int run_tracer_selftest(struct tracer *type) { struct trace_array *tr = &global_trace; struct tracer *saved_tracer = tr->current_trace; int ret; if (!type->selftest || tracing_selftest_disabled) return 0; /* * If a tracer registers early in boot up (before scheduling is * initialized and such), then do not run its selftests yet. * Instead, run it a little later in the boot process. */ if (!selftests_can_run) return save_selftest(type); /* * Run a selftest on this tracer. * Here we reset the trace buffer, and set the current * tracer to be this tracer. The tracer can then run some * internal tracing to verify that everything is in order. * If we fail, we do not register this tracer. */ tracing_reset_online_cpus(&tr->array_buffer); tr->current_trace = type; #ifdef CONFIG_TRACER_MAX_TRACE if (type->use_max_tr) { /* If we expanded the buffers, make sure the max is expanded too */ if (ring_buffer_expanded) ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, RING_BUFFER_ALL_CPUS); tr->allocated_snapshot = true; } #endif /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); ret = type->selftest(type, tr); /* the test is responsible for resetting too */ tr->current_trace = saved_tracer; if (ret) { printk(KERN_CONT "FAILED!\n"); /* Add the warning after printing 'FAILED' */ WARN_ON(1); return -1; } /* Only reset on passing, to avoid touching corrupted buffers */ tracing_reset_online_cpus(&tr->array_buffer); #ifdef CONFIG_TRACER_MAX_TRACE if (type->use_max_tr) { tr->allocated_snapshot = false; /* Shrink the max buffer again */ if (ring_buffer_expanded) ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); } #endif printk(KERN_CONT "PASSED\n"); return 0; } static __init int init_trace_selftests(void) { struct trace_selftests *p, *n; struct tracer *t, **last; int ret; selftests_can_run = true; mutex_lock(&trace_types_lock); if (list_empty(&postponed_selftests)) goto out; pr_info("Running postponed tracer tests:\n"); tracing_selftest_running = true; list_for_each_entry_safe(p, n, &postponed_selftests, list) { /* This loop can take minutes when sanitizers are enabled, so * lets make sure we allow RCU processing. */ cond_resched(); ret = run_tracer_selftest(p->type); /* If the test fails, then warn and remove from available_tracers */ if (ret < 0) { WARN(1, "tracer: %s failed selftest, disabling\n", p->type->name); last = &trace_types; for (t = trace_types; t; t = t->next) { if (t == p->type) { *last = t->next; break; } last = &t->next; } } list_del(&p->list); kfree(p); } tracing_selftest_running = false; out: mutex_unlock(&trace_types_lock); return 0; } core_initcall(init_trace_selftests); #else static inline int run_tracer_selftest(struct tracer *type) { return 0; } #endif /* CONFIG_FTRACE_STARTUP_TEST */ static void add_tracer_options(struct trace_array *tr, struct tracer *t); static void __init apply_trace_boot_options(void); /** * register_tracer - register a tracer with the ftrace system. * @type: the plugin for the tracer * * Register a new plugin tracer. */ int __init register_tracer(struct tracer *type) { struct tracer *t; int ret = 0; if (!type->name) { pr_info("Tracer must have a name\n"); return -1; } if (strlen(type->name) >= MAX_TRACER_SIZE) { pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); return -1; } if (security_locked_down(LOCKDOWN_TRACEFS)) { pr_warn("Can not register tracer %s due to lockdown\n", type->name); return -EPERM; } mutex_lock(&trace_types_lock); tracing_selftest_running = true; for (t = trace_types; t; t = t->next) { if (strcmp(type->name, t->name) == 0) { /* already found */ pr_info("Tracer %s already registered\n", type->name); ret = -1; goto out; } } if (!type->set_flag) type->set_flag = &dummy_set_flag; if (!type->flags) { /*allocate a dummy tracer_flags*/ type->flags = kmalloc(sizeof(*type->flags), GFP_KERNEL); if (!type->flags) { ret = -ENOMEM; goto out; } type->flags->val = 0; type->flags->opts = dummy_tracer_opt; } else if (!type->flags->opts) type->flags->opts = dummy_tracer_opt; /* store the tracer for __set_tracer_option */ type->flags->trace = type; ret = run_tracer_selftest(type); if (ret < 0) goto out; type->next = trace_types; trace_types = type; add_tracer_options(&global_trace, type); out: tracing_selftest_running = false; mutex_unlock(&trace_types_lock); if (ret || !default_bootup_tracer) goto out_unlock; if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE)) goto out_unlock; printk(KERN_INFO "Starting tracer '%s'\n", type->name); /* Do we want this tracer to start on bootup? */ tracing_set_tracer(&global_trace, type->name); default_bootup_tracer = NULL; apply_trace_boot_options(); /* disable other selftests, since this will break it. */ disable_tracing_selftest("running a tracer"); out_unlock: return ret; } static void tracing_reset_cpu(struct array_buffer *buf, int cpu) { struct trace_buffer *buffer = buf->buffer; if (!buffer) return; ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ synchronize_rcu(); ring_buffer_reset_cpu(buffer, cpu); ring_buffer_record_enable(buffer); } void tracing_reset_online_cpus(struct array_buffer *buf) { struct trace_buffer *buffer = buf->buffer; if (!buffer) return; ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ synchronize_rcu(); buf->time_start = buffer_ftrace_now(buf, buf->cpu); ring_buffer_reset_online_cpus(buffer); ring_buffer_record_enable(buffer); } /* Must have trace_types_lock held */ void tracing_reset_all_online_cpus_unlocked(void) { struct trace_array *tr; lockdep_assert_held(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (!tr->clear_trace) continue; tr->clear_trace = false; tracing_reset_online_cpus(&tr->array_buffer); #ifdef CONFIG_TRACER_MAX_TRACE tracing_reset_online_cpus(&tr->max_buffer); #endif } } void tracing_reset_all_online_cpus(void) { mutex_lock(&trace_types_lock); tracing_reset_all_online_cpus_unlocked(); mutex_unlock(&trace_types_lock); } /* * The tgid_map array maps from pid to tgid; i.e. the value stored at index i * is the tgid last observed corresponding to pid=i. */ static int *tgid_map; /* The maximum valid index into tgid_map. */ static size_t tgid_map_max; #define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX /* * Preemption must be disabled before acquiring trace_cmdline_lock. * The various trace_arrays' max_lock must be acquired in a context * where interrupt is disabled. */ static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; struct saved_cmdlines_buffer { unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; unsigned *map_cmdline_to_pid; unsigned cmdline_num; int cmdline_idx; char saved_cmdlines[]; }; static struct saved_cmdlines_buffer *savedcmd; static inline char *get_saved_cmdlines(int idx) { return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; } static inline void set_cmdline(int idx, const char *cmdline) { strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); } static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) { int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN); kfree(s->map_cmdline_to_pid); kmemleak_free(s); free_pages((unsigned long)s, order); } static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val) { struct saved_cmdlines_buffer *s; struct page *page; int orig_size, size; int order; /* Figure out how much is needed to hold the given number of cmdlines */ orig_size = sizeof(*s) + val * TASK_COMM_LEN; order = get_order(orig_size); size = 1 << (order + PAGE_SHIFT); page = alloc_pages(GFP_KERNEL, order); if (!page) return NULL; s = page_address(page); kmemleak_alloc(s, size, 1, GFP_KERNEL); memset(s, 0, sizeof(*s)); /* Round up to actual allocation */ val = (size - sizeof(*s)) / TASK_COMM_LEN; s->cmdline_num = val; s->map_cmdline_to_pid = kmalloc_array(val, sizeof(*s->map_cmdline_to_pid), GFP_KERNEL); if (!s->map_cmdline_to_pid) { free_saved_cmdlines_buffer(s); return NULL; } s->cmdline_idx = 0; memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(s->map_pid_to_cmdline)); memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, val * sizeof(*s->map_cmdline_to_pid)); return s; } static int trace_create_savedcmd(void) { savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT); return savedcmd ? 0 : -ENOMEM; } int is_tracing_stopped(void) { return global_trace.stop_count; } static void tracing_start_tr(struct trace_array *tr) { struct trace_buffer *buffer; unsigned long flags; if (tracing_disabled) return; raw_spin_lock_irqsave(&tr->start_lock, flags); if (--tr->stop_count) { if (WARN_ON_ONCE(tr->stop_count < 0)) { /* Someone screwed up their debugging */ tr->stop_count = 0; } goto out; } /* Prevent the buffers from switching */ arch_spin_lock(&tr->max_lock); buffer = tr->array_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); #ifdef CONFIG_TRACER_MAX_TRACE buffer = tr->max_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); #endif arch_spin_unlock(&tr->max_lock); out: raw_spin_unlock_irqrestore(&tr->start_lock, flags); } /** * tracing_start - quick start of the tracer * * If tracing is enabled but was stopped by tracing_stop, * this will start the tracer back up. */ void tracing_start(void) { return tracing_start_tr(&global_trace); } static void tracing_stop_tr(struct trace_array *tr) { struct trace_buffer *buffer; unsigned long flags; raw_spin_lock_irqsave(&tr->start_lock, flags); if (tr->stop_count++) goto out; /* Prevent the buffers from switching */ arch_spin_lock(&tr->max_lock); buffer = tr->array_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); #ifdef CONFIG_TRACER_MAX_TRACE buffer = tr->max_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); #endif arch_spin_unlock(&tr->max_lock); out: raw_spin_unlock_irqrestore(&tr->start_lock, flags); } /** * tracing_stop - quick stop of the tracer * * Light weight way to stop tracing. Use in conjunction with * tracing_start. */ void tracing_stop(void) { return tracing_stop_tr(&global_trace); } static int trace_save_cmdline(struct task_struct *tsk) { unsigned tpid, idx; /* treat recording of idle task as a success */ if (!tsk->pid) return 1; tpid = tsk->pid & (PID_MAX_DEFAULT - 1); /* * It's not the end of the world if we don't get * the lock, but we also don't want to spin * nor do we want to disable interrupts, * so if we miss here, then better luck next time. * * This is called within the scheduler and wake up, so interrupts * had better been disabled and run queue lock been held. */ lockdep_assert_preemption_disabled(); if (!arch_spin_trylock(&trace_cmdline_lock)) return 0; idx = savedcmd->map_pid_to_cmdline[tpid]; if (idx == NO_CMDLINE_MAP) { idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; savedcmd->map_pid_to_cmdline[tpid] = idx; savedcmd->cmdline_idx = idx; } savedcmd->map_cmdline_to_pid[idx] = tsk->pid; set_cmdline(idx, tsk->comm); arch_spin_unlock(&trace_cmdline_lock); return 1; } static void __trace_find_cmdline(int pid, char comm[]) { unsigned map; int tpid; if (!pid) { strcpy(comm, "<idle>"); return; } if (WARN_ON_ONCE(pid < 0)) { strcpy(comm, "<XXX>"); return; } tpid = pid & (PID_MAX_DEFAULT - 1); map = savedcmd->map_pid_to_cmdline[tpid]; if (map != NO_CMDLINE_MAP) { tpid = savedcmd->map_cmdline_to_pid[map]; if (tpid == pid) { strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); return; } } strcpy(comm, "<...>"); } void trace_find_cmdline(int pid, char comm[]) { preempt_disable(); arch_spin_lock(&trace_cmdline_lock); __trace_find_cmdline(pid, comm); arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); } static int *trace_find_tgid_ptr(int pid) { /* * Pairs with the smp_store_release in set_tracer_flag() to ensure that * if we observe a non-NULL tgid_map then we also observe the correct * tgid_map_max. */ int *map = smp_load_acquire(&tgid_map); if (unlikely(!map || pid > tgid_map_max)) return NULL; return &map[pid]; } int trace_find_tgid(int pid) { int *ptr = trace_find_tgid_ptr(pid); return ptr ? *ptr : 0; } static int trace_save_tgid(struct task_struct *tsk) { int *ptr; /* treat recording of idle task as a success */ if (!tsk->pid) return 1; ptr = trace_find_tgid_ptr(tsk->pid); if (!ptr) return 0; *ptr = tsk->tgid; return 1; } static bool tracing_record_taskinfo_skip(int flags) { if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) return true; if (!__this_cpu_read(trace_taskinfo_save)) return true; return false; } /** * tracing_record_taskinfo - record the task info of a task * * @task: task to record * @flags: TRACE_RECORD_CMDLINE for recording comm * TRACE_RECORD_TGID for recording tgid */ void tracing_record_taskinfo(struct task_struct *task, int flags) { bool done; if (tracing_record_taskinfo_skip(flags)) return; /* * Record as much task information as possible. If some fail, continue * to try to record the others. */ done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task); done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task); /* If recording any information failed, retry again soon. */ if (!done) return; __this_cpu_write(trace_taskinfo_save, false); } /** * tracing_record_taskinfo_sched_switch - record task info for sched_switch * * @prev: previous task during sched_switch * @next: next task during sched_switch * @flags: TRACE_RECORD_CMDLINE for recording comm * TRACE_RECORD_TGID for recording tgid */ void tracing_record_taskinfo_sched_switch(struct task_struct *prev, struct task_struct *next, int flags) { bool done; if (tracing_record_taskinfo_skip(flags)) return; /* * Record as much task information as possible. If some fail, continue * to try to record the others. */ done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev); done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next); done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev); done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next); /* If recording any information failed, retry again soon. */ if (!done) return; __this_cpu_write(trace_taskinfo_save, false); } /* Helpers to record a specific task information */ void tracing_record_cmdline(struct task_struct *task) { tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE); } void tracing_record_tgid(struct task_struct *task) { tracing_record_taskinfo(task, TRACE_RECORD_TGID); } /* * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function * simplifies those functions and keeps them in sync. */ enum print_line_t trace_handle_return(struct trace_seq *s) { return trace_seq_has_overflowed(s) ? TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED; } EXPORT_SYMBOL_GPL(trace_handle_return); void tracing_generic_entry_update(struct trace_entry *entry, unsigned short type, unsigned long flags, int pc) { struct task_struct *tsk = current; entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; entry->type = type; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | #else TRACE_FLAG_IRQS_NOSUPPORT | #endif ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) | (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0); } EXPORT_SYMBOL_GPL(tracing_generic_entry_update); struct ring_buffer_event * trace_buffer_lock_reserve(struct trace_buffer *buffer, int type, unsigned long len, unsigned long flags, int pc) { return __trace_buffer_lock_reserve(buffer, type, len, flags, pc); } DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); DEFINE_PER_CPU(int, trace_buffered_event_cnt); static int trace_buffered_event_ref; /** * trace_buffered_event_enable - enable buffering events * * When events are being filtered, it is quicker to use a temporary * buffer to write the event data into if there's a likely chance * that it will not be committed. The discard of the ring buffer * is not as fast as committing, and is much slower than copying * a commit. * * When an event is to be filtered, allocate per cpu buffers to * write the event data into, and if the event is filtered and discarded * it is simply dropped, otherwise, the entire data is to be committed * in one shot. */ void trace_buffered_event_enable(void) { struct ring_buffer_event *event; struct page *page; int cpu; WARN_ON_ONCE(!mutex_is_locked(&event_mutex)); if (trace_buffered_event_ref++) return; for_each_tracing_cpu(cpu) { page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); /* This is just an optimization and can handle failures */ if (!page) { pr_err("Failed to allocate event buffer\n"); break; } event = page_address(page); memset(event, 0, sizeof(*event)); per_cpu(trace_buffered_event, cpu) = event; preempt_disable(); if (cpu == smp_processor_id() && __this_cpu_read(trace_buffered_event) != per_cpu(trace_buffered_event, cpu)) WARN_ON_ONCE(1); preempt_enable(); } } static void enable_trace_buffered_event(void *data) { /* Probably not needed, but do it anyway */ smp_rmb(); this_cpu_dec(trace_buffered_event_cnt); } static void disable_trace_buffered_event(void *data) { this_cpu_inc(trace_buffered_event_cnt); } /** * trace_buffered_event_disable - disable buffering events * * When a filter is removed, it is faster to not use the buffered * events, and to commit directly into the ring buffer. Free up * the temp buffers when there are no more users. This requires * special synchronization with current events. */ void trace_buffered_event_disable(void) { int cpu; WARN_ON_ONCE(!mutex_is_locked(&event_mutex)); if (WARN_ON_ONCE(!trace_buffered_event_ref)) return; if (--trace_buffered_event_ref) return; /* For each CPU, set the buffer as used. */ on_each_cpu_mask(tracing_buffer_mask, disable_trace_buffered_event, NULL, true); /* Wait for all current users to finish */ synchronize_rcu(); for_each_tracing_cpu(cpu) { free_page((unsigned long)per_cpu(trace_buffered_event, cpu)); per_cpu(trace_buffered_event, cpu) = NULL; } /* * Wait for all CPUs that potentially started checking if they can use * their event buffer only after the previous synchronize_rcu() call and * they still read a valid pointer from trace_buffered_event. It must be * ensured they don't see cleared trace_buffered_event_cnt else they * could wrongly decide to use the pointed-to buffer which is now freed. */ synchronize_rcu(); /* For each CPU, relinquish the buffer */ on_each_cpu_mask(tracing_buffer_mask, enable_trace_buffered_event, NULL, true); } static struct trace_buffer *temp_buffer; struct ring_buffer_event * trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, struct trace_event_file *trace_file, int type, unsigned long len, unsigned long flags, int pc) { struct ring_buffer_event *entry; int val; *current_rb = trace_file->tr->array_buffer.buffer; if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && (entry = this_cpu_read(trace_buffered_event))) { /* Try to use the per cpu buffer first */ val = this_cpu_inc_return(trace_buffered_event_cnt); if ((len < (PAGE_SIZE - sizeof(*entry) - sizeof(entry->array[0]))) && val == 1) { trace_event_setup(entry, type, flags, pc); entry->array[0] = len; return entry; } this_cpu_dec(trace_buffered_event_cnt); } entry = __trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); /* * If tracing is off, but we have triggers enabled * we still need to look at the event data. Use the temp_buffer * to store the trace event for the trigger to use. It's recursive * safe and will not be recorded anywhere. */ if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) { *current_rb = temp_buffer; entry = __trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); } return entry; } EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); static DEFINE_RAW_SPINLOCK(tracepoint_iter_lock); static DEFINE_MUTEX(tracepoint_printk_mutex); static void output_printk(struct trace_event_buffer *fbuffer) { struct trace_event_call *event_call; struct trace_event_file *file; struct trace_event *event; unsigned long flags; struct trace_iterator *iter = tracepoint_print_iter; /* We should never get here if iter is NULL */ if (WARN_ON_ONCE(!iter)) return; event_call = fbuffer->trace_file->event_call; if (!event_call || !event_call->event.funcs || !event_call->event.funcs->trace) return; file = fbuffer->trace_file; if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && !filter_match_preds(file->filter, fbuffer->entry))) return; event = &fbuffer->trace_file->event_call->event; raw_spin_lock_irqsave(&tracepoint_iter_lock, flags); trace_seq_init(&iter->seq); iter->ent = fbuffer->entry; event_call->event.funcs->trace(iter, 0, event); trace_seq_putc(&iter->seq, 0); printk("%s", iter->seq.buffer); raw_spin_unlock_irqrestore(&tracepoint_iter_lock, flags); } int tracepoint_printk_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int save_tracepoint_printk; int ret; mutex_lock(&tracepoint_printk_mutex); save_tracepoint_printk = tracepoint_printk; ret = proc_dointvec(table, write, buffer, lenp, ppos); /* * This will force exiting early, as tracepoint_printk * is always zero when tracepoint_printk_iter is not allocated */ if (!tracepoint_print_iter) tracepoint_printk = 0; if (save_tracepoint_printk == tracepoint_printk) goto out; if (tracepoint_printk) static_key_enable(&tracepoint_printk_key.key); else static_key_disable(&tracepoint_printk_key.key); out: mutex_unlock(&tracepoint_printk_mutex); return ret; } void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) { if (static_key_false(&tracepoint_printk_key.key)) output_printk(fbuffer); if (static_branch_unlikely(&trace_event_exports_enabled)) ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT); event_trigger_unlock_commit_regs(fbuffer->trace_file, fbuffer->buffer, fbuffer->event, fbuffer->entry, fbuffer->flags, fbuffer->pc, fbuffer->regs); } EXPORT_SYMBOL_GPL(trace_event_buffer_commit); /* * Skip 3: * * trace_buffer_unlock_commit_regs() * trace_event_buffer_commit() * trace_event_raw_event_xxx() */ # define STACK_SKIP 3 void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct trace_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc, struct pt_regs *regs) { __buffer_unlock_commit(buffer, event); /* * If regs is not set, then skip the necessary functions. * Note, we can still get here via blktrace, wakeup tracer * and mmiotrace, but that's ok if they lose a function or * two. They are not that meaningful. */ ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs); ftrace_trace_userstack(tr, buffer, flags, pc); } /* * Similar to trace_buffer_unlock_commit_regs() but do not dump stack. */ void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, struct ring_buffer_event *event) { __buffer_unlock_commit(buffer, event); } void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { struct trace_event_call *call = &event_function; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), flags, pc); if (!event) return; entry = ring_buffer_event_data(event); entry->ip = ip; entry->parent_ip = parent_ip; if (!call_filter_check_discard(call, entry, buffer, event)) { if (static_branch_unlikely(&trace_function_exports_enabled)) ftrace_exports(event, TRACE_EXPORT_FUNCTION); __buffer_unlock_commit(buffer, event); } } #ifdef CONFIG_STACKTRACE /* Allow 4 levels of nesting: normal, softirq, irq, NMI */ #define FTRACE_KSTACK_NESTING 4 #define FTRACE_KSTACK_ENTRIES (PAGE_SIZE / FTRACE_KSTACK_NESTING) struct ftrace_stack { unsigned long calls[FTRACE_KSTACK_ENTRIES]; }; struct ftrace_stacks { struct ftrace_stack stacks[FTRACE_KSTACK_NESTING]; }; static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks); static DEFINE_PER_CPU(int, ftrace_stack_reserve); static void __ftrace_trace_stack(struct trace_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs) { struct trace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; unsigned int size, nr_entries; struct ftrace_stack *fstack; struct stack_entry *entry; int stackidx; /* * Add one, for this function and the call to save_stack_trace() * If regs is set, then these functions will not be in the way. */ #ifndef CONFIG_UNWINDER_ORC if (!regs) skip++; #endif preempt_disable_notrace(); stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; /* This should never happen. If it does, yell once and skip */ if (WARN_ON_ONCE(stackidx >= FTRACE_KSTACK_NESTING)) goto out; /* * The above __this_cpu_inc_return() is 'atomic' cpu local. An * interrupt will either see the value pre increment or post * increment. If the interrupt happens pre increment it will have * restored the counter when it returns. We just need a barrier to * keep gcc from moving things around. */ barrier(); fstack = this_cpu_ptr(ftrace_stacks.stacks) + stackidx; size = ARRAY_SIZE(fstack->calls); if (regs) { nr_entries = stack_trace_save_regs(regs, fstack->calls, size, skip); } else { nr_entries = stack_trace_save(fstack->calls, size, skip); } size = nr_entries * sizeof(unsigned long); event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, (sizeof(*entry) - sizeof(entry->caller)) + size, flags, pc); if (!event) goto out; entry = ring_buffer_event_data(event); memcpy(&entry->caller, fstack->calls, size); entry->size = nr_entries; if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); out: /* Again, don't let gcc optimize things here */ barrier(); __this_cpu_dec(ftrace_stack_reserve); preempt_enable_notrace(); } static inline void ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs) { if (!(tr->trace_flags & TRACE_ITER_STACKTRACE)) return; __ftrace_trace_stack(buffer, flags, skip, pc, regs); } void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { struct trace_buffer *buffer = tr->array_buffer.buffer; if (rcu_is_watching()) { __ftrace_trace_stack(buffer, flags, skip, pc, NULL); return; } /* * When an NMI triggers, RCU is enabled via rcu_nmi_enter(), * but if the above rcu_is_watching() failed, then the NMI * triggered someplace critical, and rcu_irq_enter() should * not be called from NMI. */ if (unlikely(in_nmi())) return; rcu_irq_enter_irqson(); __ftrace_trace_stack(buffer, flags, skip, pc, NULL); rcu_irq_exit_irqson(); } /** * trace_dump_stack - record a stack back trace in the trace buffer * @skip: Number of functions to skip (helper handlers) */ void trace_dump_stack(int skip) { unsigned long flags; if (tracing_disabled || tracing_selftest_running) return; local_save_flags(flags); #ifndef CONFIG_UNWINDER_ORC /* Skip 1 to skip this function. */ skip++; #endif __ftrace_trace_stack(global_trace.array_buffer.buffer, flags, skip, preempt_count(), NULL); } EXPORT_SYMBOL_GPL(trace_dump_stack); #ifdef CONFIG_USER_STACKTRACE_SUPPORT static DEFINE_PER_CPU(int, user_stack_count); static void ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer, unsigned long flags, int pc) { struct trace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE)) return; /* * NMIs can not handle page faults, even with fix ups. * The save user stack can (and often does) fault. */ if (unlikely(in_nmi())) return; /* * prevent recursion, since the user stack tracing may * trigger other kernel events. */ preempt_disable(); if (__this_cpu_read(user_stack_count)) goto out; __this_cpu_inc(user_stack_count); event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, sizeof(*entry), flags, pc); if (!event) goto out_drop_count; entry = ring_buffer_event_data(event); entry->tgid = current->tgid; memset(&entry->caller, 0, sizeof(entry->caller)); stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES); if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); out_drop_count: __this_cpu_dec(user_stack_count); out: preempt_enable(); } #else /* CONFIG_USER_STACKTRACE_SUPPORT */ static void ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer, unsigned long flags, int pc) { } #endif /* !CONFIG_USER_STACKTRACE_SUPPORT */ #endif /* CONFIG_STACKTRACE */ /* created for use with alloc_percpu */ struct trace_buffer_struct { int nesting; char buffer[4][TRACE_BUF_SIZE]; }; static struct trace_buffer_struct __percpu *trace_percpu_buffer; /* * Thise allows for lockless recording. If we're nested too deeply, then * this returns NULL. */ static char *get_trace_buf(void) { struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer); if (!trace_percpu_buffer || buffer->nesting >= 4) return NULL; buffer->nesting++; /* Interrupts must see nesting incremented before we use the buffer */ barrier(); return &buffer->buffer[buffer->nesting - 1][0]; } static void put_trace_buf(void) { /* Don't let the decrement of nesting leak before this */ barrier(); this_cpu_dec(trace_percpu_buffer->nesting); } static int alloc_percpu_trace_buffer(void) { struct trace_buffer_struct __percpu *buffers; if (trace_percpu_buffer) return 0; buffers = alloc_percpu(struct trace_buffer_struct); if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer")) return -ENOMEM; trace_percpu_buffer = buffers; return 0; } static int buffers_allocated; void trace_printk_init_buffers(void) { if (buffers_allocated) return; if (alloc_percpu_trace_buffer()) return; /* trace_printk() is for debug use only. Don't use it in production. */ pr_warn("\n"); pr_warn("**********************************************************\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warn("** **\n"); pr_warn("** trace_printk() being used. Allocating extra memory. **\n"); pr_warn("** **\n"); pr_warn("** This means that this is a DEBUG kernel and it is **\n"); pr_warn("** unsafe for production use. **\n"); pr_warn("** **\n"); pr_warn("** If you see this message and you are not debugging **\n"); pr_warn("** the kernel, report this immediately to your vendor! **\n"); pr_warn("** **\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warn("**********************************************************\n"); /* Expand the buffers to set size */ tracing_update_buffers(); buffers_allocated = 1; /* * trace_printk_init_buffers() can be called by modules. * If that happens, then we need to start cmdline recording * directly here. If the global_trace.buffer is already * allocated here, then this was called by module code. */ if (global_trace.array_buffer.buffer) tracing_start_cmdline_record(); } EXPORT_SYMBOL_GPL(trace_printk_init_buffers); void trace_printk_start_comm(void) { /* Start tracing comms if trace printk is set */ if (!buffers_allocated) return; tracing_start_cmdline_record(); } static void trace_printk_start_stop_comm(int enabled) { if (!buffers_allocated) return; if (enabled) tracing_start_cmdline_record(); else tracing_stop_cmdline_record(); } /** * trace_vbprintk - write binary msg to tracing buffer * @ip: The address of the caller * @fmt: The string format to write to the buffer * @args: Arguments for @fmt */ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { struct trace_event_call *call = &event_bprint; struct ring_buffer_event *event; struct trace_buffer *buffer; struct trace_array *tr = &global_trace; struct bprint_entry *entry; unsigned long flags; char *tbuffer; int len = 0, size, pc; if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; /* Don't pollute graph traces with trace_vprintk internals */ pause_graph_tracing(); pc = preempt_count(); preempt_disable_notrace(); tbuffer = get_trace_buf(); if (!tbuffer) { len = 0; goto out_nobuffer; } len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) goto out_put; local_save_flags(flags); size = sizeof(*entry) + sizeof(u32) * len; buffer = tr->array_buffer.buffer; ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, flags, pc); if (!event) goto out; entry = ring_buffer_event_data(event); entry->ip = ip; entry->fmt = fmt; memcpy(entry->buf, tbuffer, sizeof(u32) * len); if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL); } out: ring_buffer_nest_end(buffer); out_put: put_trace_buf(); out_nobuffer: preempt_enable_notrace(); unpause_graph_tracing(); return len; } EXPORT_SYMBOL_GPL(trace_vbprintk); __printf(3, 0) static int __trace_array_vprintk(struct trace_buffer *buffer, unsigned long ip, const char *fmt, va_list args) { struct trace_event_call *call = &event_print; struct ring_buffer_event *event; int len = 0, size, pc; struct print_entry *entry; unsigned long flags; char *tbuffer; if (tracing_disabled || tracing_selftest_running) return 0; /* Don't pollute graph traces with trace_vprintk internals */ pause_graph_tracing(); pc = preempt_count(); preempt_disable_notrace(); tbuffer = get_trace_buf(); if (!tbuffer) { len = 0; goto out_nobuffer; } len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); local_save_flags(flags); size = sizeof(*entry) + len + 1; ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, flags, pc); if (!event) goto out; entry = ring_buffer_event_data(event); entry->ip = ip; memcpy(&entry->buf, tbuffer, len + 1); if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL); } out: ring_buffer_nest_end(buffer); put_trace_buf(); out_nobuffer: preempt_enable_notrace(); unpause_graph_tracing(); return len; } __printf(3, 0) int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args); } /** * trace_array_printk - Print a message to a specific instance * @tr: The instance trace_array descriptor * @ip: The instruction pointer that this is called from. * @fmt: The format to print (printf format) * * If a subsystem sets up its own instance, they have the right to * printk strings into their tracing instance buffer using this * function. Note, this function will not write into the top level * buffer (use trace_printk() for that), as writing into the top level * buffer should only have events that can be individually disabled. * trace_printk() is only used for debugging a kernel, and should not * be ever encorporated in normal use. * * trace_array_printk() can be used, as it will not add noise to the * top level tracing buffer. * * Note, trace_array_init_printk() must be called on @tr before this * can be used. */ __printf(3, 0) int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...) { int ret; va_list ap; if (!tr) return -ENOENT; /* This is only allowed for created instances */ if (tr == &global_trace) return 0; if (!(tr->trace_flags & TRACE_ITER_PRINTK)) return 0; va_start(ap, fmt); ret = trace_array_vprintk(tr, ip, fmt, ap); va_end(ap); return ret; } EXPORT_SYMBOL_GPL(trace_array_printk); /** * trace_array_init_printk - Initialize buffers for trace_array_printk() * @tr: The trace array to initialize the buffers for * * As trace_array_printk() only writes into instances, they are OK to * have in the kernel (unlike trace_printk()). This needs to be called * before trace_array_printk() can be used on a trace_array. */ int trace_array_init_printk(struct trace_array *tr) { if (!tr) return -ENOENT; /* This is only allowed for created instances */ if (tr == &global_trace) return -EINVAL; return alloc_percpu_trace_buffer(); } EXPORT_SYMBOL_GPL(trace_array_init_printk); __printf(3, 4) int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...) { int ret; va_list ap; if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; va_start(ap, fmt); ret = __trace_array_vprintk(buffer, ip, fmt, ap); va_end(ap); return ret; } __printf(2, 0) int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { return trace_array_vprintk(&global_trace, ip, fmt, args); } EXPORT_SYMBOL_GPL(trace_vprintk); static void trace_iterator_increment(struct trace_iterator *iter) { struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu); iter->idx++; if (buf_iter) ring_buffer_iter_advance(buf_iter); } static struct trace_entry * peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, unsigned long *lost_events) { struct ring_buffer_event *event; struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu); if (buf_iter) { event = ring_buffer_iter_peek(buf_iter, ts); if (lost_events) *lost_events = ring_buffer_iter_dropped(buf_iter) ? (unsigned long)-1 : 0; } else { event = ring_buffer_peek(iter->array_buffer->buffer, cpu, ts, lost_events); } if (event) { iter->ent_size = ring_buffer_event_length(event); return ring_buffer_event_data(event); } iter->ent_size = 0; return NULL; } static struct trace_entry * __find_next_entry(struct trace_iterator *iter, int *ent_cpu, unsigned long *missing_events, u64 *ent_ts) { struct trace_buffer *buffer = iter->array_buffer->buffer; struct trace_entry *ent, *next = NULL; unsigned long lost_events = 0, next_lost = 0; int cpu_file = iter->cpu_file; u64 next_ts = 0, ts; int next_cpu = -1; int next_size = 0; int cpu; /* * If we are in a per_cpu trace file, don't bother by iterating over * all cpu and peek directly. */ if (cpu_file > RING_BUFFER_ALL_CPUS) { if (ring_buffer_empty_cpu(buffer, cpu_file)) return NULL; ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); if (ent_cpu) *ent_cpu = cpu_file; return ent; } for_each_tracing_cpu(cpu) { if (ring_buffer_empty_cpu(buffer, cpu)) continue; ent = peek_next_entry(iter, cpu, &ts, &lost_events); /* * Pick the entry with the smallest timestamp: */ if (ent && (!next || ts < next_ts)) { next = ent; next_cpu = cpu; next_ts = ts; next_lost = lost_events; next_size = iter->ent_size; } } iter->ent_size = next_size; if (ent_cpu) *ent_cpu = next_cpu; if (ent_ts) *ent_ts = next_ts; if (missing_events) *missing_events = next_lost; return next; } #define STATIC_FMT_BUF_SIZE 128 static char static_fmt_buf[STATIC_FMT_BUF_SIZE]; static char *trace_iter_expand_format(struct trace_iterator *iter) { char *tmp; if (iter->fmt == static_fmt_buf) return NULL; tmp = krealloc(iter->fmt, iter->fmt_size + STATIC_FMT_BUF_SIZE, GFP_KERNEL); if (tmp) { iter->fmt_size += STATIC_FMT_BUF_SIZE; iter->fmt = tmp; } return tmp; } const char *trace_event_format(struct trace_iterator *iter, const char *fmt) { const char *p, *new_fmt; char *q; if (WARN_ON_ONCE(!fmt)) return fmt; p = fmt; new_fmt = q = iter->fmt; while (*p) { if (unlikely(q - new_fmt + 3 > iter->fmt_size)) { if (!trace_iter_expand_format(iter)) return fmt; q += iter->fmt - new_fmt; new_fmt = iter->fmt; } *q++ = *p++; /* Replace %p with %px */ if (p[-1] == '%') { if (p[0] == '%') { *q++ = *p++; } else if (p[0] == 'p' && !isalnum(p[1])) { *q++ = *p++; *q++ = 'x'; } } } *q = '\0'; return new_fmt; } #define STATIC_TEMP_BUF_SIZE 128 static char static_temp_buf[STATIC_TEMP_BUF_SIZE] __aligned(4); /* Find the next real entry, without updating the iterator itself */ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) { /* __find_next_entry will reset ent_size */ int ent_size = iter->ent_size; struct trace_entry *entry; /* * If called from ftrace_dump(), then the iter->temp buffer * will be the static_temp_buf and not created from kmalloc. * If the entry size is greater than the buffer, we can * not save it. Just return NULL in that case. This is only * used to add markers when two consecutive events' time * stamps have a large delta. See trace_print_lat_context() */ if (iter->temp == static_temp_buf && STATIC_TEMP_BUF_SIZE < ent_size) return NULL; /* * The __find_next_entry() may call peek_next_entry(), which may * call ring_buffer_peek() that may make the contents of iter->ent * undefined. Need to copy iter->ent now. */ if (iter->ent && iter->ent != iter->temp) { if ((!iter->temp || iter->temp_size < iter->ent_size) && !WARN_ON_ONCE(iter->temp == static_temp_buf)) { void *temp; temp = kmalloc(iter->ent_size, GFP_KERNEL); if (!temp) return NULL; kfree(iter->temp); iter->temp = temp; iter->temp_size = iter->ent_size; } memcpy(iter->temp, iter->ent, iter->ent_size); iter->ent = iter->temp; } entry = __find_next_entry(iter, ent_cpu, NULL, ent_ts); /* Put back the original ent_size */ iter->ent_size = ent_size; return entry; } /* Find the next real entry, and increment the iterator to the next entry */ void *trace_find_next_entry_inc(struct trace_iterator *iter) { iter->ent = __find_next_entry(iter, &iter->cpu, &iter->lost_events, &iter->ts); if (iter->ent) trace_iterator_increment(iter); return iter->ent ? iter : NULL; } static void trace_consume(struct trace_iterator *iter) { ring_buffer_consume(iter->array_buffer->buffer, iter->cpu, &iter->ts, &iter->lost_events); } static void *s_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_iterator *iter = m->private; int i = (int)*pos; void *ent; WARN_ON_ONCE(iter->leftover); (*pos)++; /* can't go backwards */ if (iter->idx > i) return NULL; if (iter->idx < 0) ent = trace_find_next_entry_inc(iter); else ent = iter; while (ent && iter->idx < i) ent = trace_find_next_entry_inc(iter); iter->pos = *pos; return ent; } void tracing_iter_reset(struct trace_iterator *iter, int cpu) { struct ring_buffer_iter *buf_iter; unsigned long entries = 0; u64 ts; per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = 0; buf_iter = trace_buffer_iter(iter, cpu); if (!buf_iter) return; ring_buffer_iter_reset(buf_iter); /* * We could have the case with the max latency tracers * that a reset never took place on a cpu. This is evident * by the timestamp being before the start of the buffer. */ while (ring_buffer_iter_peek(buf_iter, &ts)) { if (ts >= iter->array_buffer->time_start) break; entries++; ring_buffer_iter_advance(buf_iter); } per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries; } /* * The current tracer is copied to avoid a global locking * all around. */ static void *s_start(struct seq_file *m, loff_t *pos) { struct trace_iterator *iter = m->private; struct trace_array *tr = iter->tr; int cpu_file = iter->cpu_file; void *p = NULL; loff_t l = 0; int cpu; /* * copy the tracer to avoid using a global lock all around. * iter->trace is a copy of current_trace, the pointer to the * name may be used instead of a strcmp(), as iter->trace->name * will point to the same string as current_trace->name. */ mutex_lock(&trace_types_lock); if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) { /* Close iter->trace before switching to the new current tracer */ if (iter->trace->close) iter->trace->close(iter); *iter->trace = *tr->current_trace; /* Reopen the new current tracer */ if (iter->trace->open) iter->trace->open(iter); } mutex_unlock(&trace_types_lock); #ifdef CONFIG_TRACER_MAX_TRACE if (iter->snapshot && iter->trace->use_max_tr) return ERR_PTR(-EBUSY); #endif if (*pos != iter->pos) { iter->ent = NULL; iter->cpu = 0; iter->idx = -1; if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) tracing_iter_reset(iter, cpu); } else tracing_iter_reset(iter, cpu_file); iter->leftover = 0; for (p = iter; p && l < *pos; p = s_next(m, p, &l)) ; } else { /* * If we overflowed the seq_file before, then we want * to just reuse the trace_seq buffer again. */ if (iter->leftover) p = iter; else { l = *pos - 1; p = s_next(m, p, &l); } } trace_event_read_lock(); trace_access_lock(cpu_file); return p; } static void s_stop(struct seq_file *m, void *p) { struct trace_iterator *iter = m->private; #ifdef CONFIG_TRACER_MAX_TRACE if (iter->snapshot && iter->trace->use_max_tr) return; #endif trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); } static void get_total_entries_cpu(struct array_buffer *buf, unsigned long *total, unsigned long *entries, int cpu) { unsigned long count; count = ring_buffer_entries_cpu(buf->buffer, cpu); /* * If this buffer has skipped entries, then we hold all * entries for the trace and we need to ignore the * ones before the time stamp. */ if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; /* total is the same as the entries */ *total = count; } else *total = count + ring_buffer_overrun_cpu(buf->buffer, cpu); *entries = count; } static void get_total_entries(struct array_buffer *buf, unsigned long *total, unsigned long *entries) { unsigned long t, e; int cpu; *total = 0; *entries = 0; for_each_tracing_cpu(cpu) { get_total_entries_cpu(buf, &t, &e, cpu); *total += t; *entries += e; } } unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu) { unsigned long total, entries; if (!tr) tr = &global_trace; get_total_entries_cpu(&tr->array_buffer, &total, &entries, cpu); return entries; } unsigned long trace_total_entries(struct trace_array *tr) { unsigned long total, entries; if (!tr) tr = &global_trace; get_total_entries(&tr->array_buffer, &total, &entries); return entries; } static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n" "# / _-----=> irqs-off \n" "# | / _----=> need-resched \n" "# || / _---=> hardirq/softirq \n" "# ||| / _--=> preempt-depth \n" "# |||| / delay \n" "# cmd pid ||||| time | caller \n" "# \\ / ||||| \\ | / \n"); } static void print_event_info(struct array_buffer *buf, struct seq_file *m) { unsigned long total; unsigned long entries; get_total_entries(buf, &total, &entries); seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", entries, total, num_online_cpus()); seq_puts(m, "#\n"); } static void print_func_help_header(struct array_buffer *buf, struct seq_file *m, unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; print_event_info(buf, m); seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? " TGID " : ""); seq_printf(m, "# | | %s | | |\n", tgid ? " | " : ""); } static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file *m, unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; const char *space = " "; int prec = tgid ? 12 : 2; print_event_info(buf, m); seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); seq_printf(m, "# %.*s||| / delay\n", prec, space); seq_printf(m, "# TASK-PID %.*s CPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID "); seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | "); } void print_trace_header(struct seq_file *m, struct trace_iterator *iter) { unsigned long sym_flags = (global_trace.trace_flags & TRACE_ITER_SYM_MASK); struct array_buffer *buf = iter->array_buffer; struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu); struct tracer *type = iter->trace; unsigned long entries; unsigned long total; const char *name = "preemption"; name = type->name; get_total_entries(buf, &total, &entries); seq_printf(m, "# %s latency trace v1.1.5 on %s\n", name, UTS_RELEASE); seq_puts(m, "# -----------------------------------" "---------------------------------\n"); seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |" " (M:%s VP:%d, KP:%d, SP:%d HP:%d", nsecs_to_usecs(data->saved_latency), entries, total, buf->cpu, #if defined(CONFIG_PREEMPT_NONE) "server", #elif defined(CONFIG_PREEMPT_VOLUNTARY) "desktop", #elif defined(CONFIG_PREEMPT) "preempt", #elif defined(CONFIG_PREEMPT_RT) "preempt_rt", #else "unknown", #endif /* These are reserved for later use */ 0, 0, 0, 0); #ifdef CONFIG_SMP seq_printf(m, " #P:%d)\n", num_online_cpus()); #else seq_puts(m, ")\n"); #endif seq_puts(m, "# -----------------\n"); seq_printf(m, "# | task: %.16s-%d " "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", data->comm, data->pid, from_kuid_munged(seq_user_ns(m), data->uid), data->nice, data->policy, data->rt_priority); seq_puts(m, "# -----------------\n"); if (data->critical_start) { seq_puts(m, "# => started at: "); seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags); trace_print_seq(m, &iter->seq); seq_puts(m, "\n# => ended at: "); seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); trace_print_seq(m, &iter->seq); seq_puts(m, "\n#\n"); } seq_puts(m, "#\n"); } static void test_cpu_buff_start(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_array *tr = iter->tr; if (!(tr->trace_flags & TRACE_ITER_ANNOTATE)) return; if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) return; if (cpumask_available(iter->started) && cpumask_test_cpu(iter->cpu, iter->started)) return; if (per_cpu_ptr(iter->array_buffer->data, iter->cpu)->skipped_entries) return; if (cpumask_available(iter->started)) cpumask_set_cpu(iter->cpu, iter->started); /* Don't print started cpu buffer for the first entry of the trace */ if (iter->idx > 1) trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); } static enum print_line_t print_trace_fmt(struct trace_iterator *iter) { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; unsigned long sym_flags = (tr->trace_flags & TRACE_ITER_SYM_MASK); struct trace_entry *entry; struct trace_event *event; entry = iter->ent; test_cpu_buff_start(iter); event = ftrace_find_event(entry->type); if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { if (iter->iter_flags & TRACE_FILE_LAT_FMT) trace_print_lat_context(iter); else trace_print_context(iter); } if (trace_seq_has_overflowed(s)) return TRACE_TYPE_PARTIAL_LINE; if (event) return event->funcs->trace(iter, sym_flags, event); trace_seq_printf(s, "Unknown type %d\n", entry->type); return trace_handle_return(s); } static enum print_line_t print_raw_fmt(struct trace_iterator *iter) { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *entry; struct trace_event *event; entry = iter->ent; if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) trace_seq_printf(s, "%d %d %llu ", entry->pid, iter->cpu, iter->ts); if (trace_seq_has_overflowed(s)) return TRACE_TYPE_PARTIAL_LINE; event = ftrace_find_event(entry->type); if (event) return event->funcs->raw(iter, 0, event); trace_seq_printf(s, "%d ?\n", entry->type); return trace_handle_return(s); } static enum print_line_t print_hex_fmt(struct trace_iterator *iter) { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; unsigned char newline = '\n'; struct trace_entry *entry; struct trace_event *event; entry = iter->ent; if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { SEQ_PUT_HEX_FIELD(s, entry->pid); SEQ_PUT_HEX_FIELD(s, iter->cpu); SEQ_PUT_HEX_FIELD(s, iter->ts); if (trace_seq_has_overflowed(s)) return TRACE_TYPE_PARTIAL_LINE; } event = ftrace_find_event(entry->type); if (event) { enum print_line_t ret = event->funcs->hex(iter, 0, event); if (ret != TRACE_TYPE_HANDLED) return ret; } SEQ_PUT_FIELD(s, newline); return trace_handle_return(s); } static enum print_line_t print_bin_fmt(struct trace_iterator *iter) { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *entry; struct trace_event *event; entry = iter->ent; if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { SEQ_PUT_FIELD(s, entry->pid); SEQ_PUT_FIELD(s, iter->cpu); SEQ_PUT_FIELD(s, iter->ts); if (trace_seq_has_overflowed(s)) return TRACE_TYPE_PARTIAL_LINE; } event = ftrace_find_event(entry->type); return event ? event->funcs->binary(iter, 0, event) : TRACE_TYPE_HANDLED; } int trace_empty(struct trace_iterator *iter) { struct ring_buffer_iter *buf_iter; int cpu; /* If we are looking at one CPU buffer, only check that one */ if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { cpu = iter->cpu_file; buf_iter = trace_buffer_iter(iter, cpu); if (buf_iter) { if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { if (!ring_buffer_empty_cpu(iter->array_buffer->buffer, cpu)) return 0; } return 1; } for_each_tracing_cpu(cpu) { buf_iter = trace_buffer_iter(iter, cpu); if (buf_iter) { if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { if (!ring_buffer_empty_cpu(iter->array_buffer->buffer, cpu)) return 0; } } return 1; } /* Called with trace_event_read_lock() held. */ enum print_line_t print_trace_line(struct trace_iterator *iter) { struct trace_array *tr = iter->tr; unsigned long trace_flags = tr->trace_flags; enum print_line_t ret; if (iter->lost_events) { if (iter->lost_events == (unsigned long)-1) trace_seq_printf(&iter->seq, "CPU:%d [LOST EVENTS]\n", iter->cpu); else trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", iter->cpu, iter->lost_events); if (trace_seq_has_overflowed(&iter->seq)) return TRACE_TYPE_PARTIAL_LINE; } if (iter->trace && iter->trace->print_line) { ret = iter->trace->print_line(iter); if (ret != TRACE_TYPE_UNHANDLED) return ret; } if (iter->ent->type == TRACE_BPUTS && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) return trace_print_bputs_msg_only(iter); if (iter->ent->type == TRACE_BPRINT && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) return trace_print_bprintk_msg_only(iter); if (iter->ent->type == TRACE_PRINT && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) return trace_print_printk_msg_only(iter); if (trace_flags & TRACE_ITER_BIN) return print_bin_fmt(iter); if (trace_flags & TRACE_ITER_HEX) return print_hex_fmt(iter); if (trace_flags & TRACE_ITER_RAW) return print_raw_fmt(iter); return print_trace_fmt(iter); } void trace_latency_header(struct seq_file *m) { struct trace_iterator *iter = m->private; struct trace_array *tr = iter->tr; /* print nothing if the buffers are empty */ if (trace_empty(iter)) return; if (iter->iter_flags & TRACE_FILE_LAT_FMT) print_trace_header(m, iter); if (!(tr->trace_flags & TRACE_ITER_VERBOSE)) print_lat_help_header(m); } void trace_default_header(struct seq_file *m) { struct trace_iterator *iter = m->private; struct trace_array *tr = iter->tr; unsigned long trace_flags = tr->trace_flags; if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) return; if (iter->iter_flags & TRACE_FILE_LAT_FMT) { /* print nothing if the buffers are empty */ if (trace_empty(iter)) return; print_trace_header(m, iter); if (!(trace_flags & TRACE_ITER_VERBOSE)) print_lat_help_header(m); } else { if (!(trace_flags & TRACE_ITER_VERBOSE)) { if (trace_flags & TRACE_ITER_IRQ_INFO) print_func_help_header_irq(iter->array_buffer, m, trace_flags); else print_func_help_header(iter->array_buffer, m, trace_flags); } } } static void test_ftrace_alive(struct seq_file *m) { if (!ftrace_is_dead()) return; seq_puts(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n" "# MAY BE MISSING FUNCTION EVENTS\n"); } #ifdef CONFIG_TRACER_MAX_TRACE static void show_snapshot_main_help(struct seq_file *m) { seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n" "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" "# Takes a snapshot of the main buffer.\n" "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n" "# (Doesn't have to be '2' works with any number that\n" "# is not a '0' or '1')\n"); } static void show_snapshot_percpu_help(struct seq_file *m) { seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" "# Takes a snapshot of the main buffer for this cpu.\n"); #else seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n" "# Must use main snapshot file to allocate.\n"); #endif seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n" "# (Doesn't have to be '2' works with any number that\n" "# is not a '0' or '1')\n"); } static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { if (iter->tr->allocated_snapshot) seq_puts(m, "#\n# * Snapshot is allocated *\n#\n"); else seq_puts(m, "#\n# * Snapshot is freed *\n#\n"); seq_puts(m, "# Snapshot commands:\n"); if (iter->cpu_file == RING_BUFFER_ALL_CPUS) show_snapshot_main_help(m); else show_snapshot_percpu_help(m); } #else /* Should never be called */ static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { } #endif static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; int ret; if (iter->ent == NULL) { if (iter->tr) { seq_printf(m, "# tracer: %s\n", iter->trace->name); seq_puts(m, "#\n"); test_ftrace_alive(m); } if (iter->snapshot && trace_empty(iter)) print_snapshot_help(m, iter); else if (iter->trace && iter->trace->print_header) iter->trace->print_header(m); else trace_default_header(m); } else if (iter->leftover) { /* * If we filled the seq_file buffer earlier, we * want to just show it now. */ ret = trace_print_seq(m, &iter->seq); /* ret should this time be zero, but you never know */ iter->leftover = ret; } else { ret = print_trace_line(iter); if (ret == TRACE_TYPE_PARTIAL_LINE) { iter->seq.full = 0; trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n"); } ret = trace_print_seq(m, &iter->seq); /* * If we overflow the seq_file buffer, then it will * ask us for this data again at start up. * Use that instead. * ret is 0 if seq_file write succeeded. * -1 otherwise. */ iter->leftover = ret; } return 0; } /* * Should be used after trace_array_get(), trace_types_lock * ensures that i_cdev was already initialized. */ static inline int tracing_get_cpu(struct inode *inode) { if (inode->i_cdev) /* See trace_create_cpu_file() */ return (long)inode->i_cdev - 1; return RING_BUFFER_ALL_CPUS; } static const struct seq_operations tracer_seq_ops = { .start = s_start, .next = s_next, .stop = s_stop, .show = s_show, }; static struct trace_iterator * __tracing_open(struct inode *inode, struct file *file, bool snapshot) { struct trace_array *tr = inode->i_private; struct trace_iterator *iter; int cpu; if (tracing_disabled) return ERR_PTR(-ENODEV); iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter)); if (!iter) return ERR_PTR(-ENOMEM); iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter), GFP_KERNEL); if (!iter->buffer_iter) goto release; /* * trace_find_next_entry() may need to save off iter->ent. * It will place it into the iter->temp buffer. As most * events are less than 128, allocate a buffer of that size. * If one is greater, then trace_find_next_entry() will * allocate a new buffer to adjust for the bigger iter->ent. * It's not critical if it fails to get allocated here. */ iter->temp = kmalloc(128, GFP_KERNEL); if (iter->temp) iter->temp_size = 128; /* * trace_event_printf() may need to modify given format * string to replace %p with %px so that it shows real address * instead of hash value. However, that is only for the event * tracing, other tracer may not need. Defer the allocation * until it is needed. */ iter->fmt = NULL; iter->fmt_size = 0; /* * We make a copy of the current tracer to avoid concurrent * changes on it while we are reading. */ mutex_lock(&trace_types_lock); iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL); if (!iter->trace) goto fail; *iter->trace = *tr->current_trace; if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; iter->tr = tr; #ifdef CONFIG_TRACER_MAX_TRACE /* Currently only the top directory has a snapshot */ if (tr->current_trace->print_max || snapshot) iter->array_buffer = &tr->max_buffer; else #endif iter->array_buffer = &tr->array_buffer; iter->snapshot = snapshot; iter->pos = -1; iter->cpu_file = tracing_get_cpu(inode); mutex_init(&iter->mutex); /* Notify the tracer early; before we stop tracing. */ if (iter->trace->open) iter->trace->open(iter); /* Annotate start of buffers if we had overruns */ if (ring_buffer_overruns(iter->array_buffer->buffer)) iter->iter_flags |= TRACE_FILE_ANNOTATE; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ if (trace_clocks[tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; /* * If pause-on-trace is enabled, then stop the trace while * dumping, unless this is the "snapshot" file */ if (!iter->snapshot && (tr->trace_flags & TRACE_ITER_PAUSE_ON_TRACE)) tracing_stop_tr(tr); if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = ring_buffer_read_prepare(iter->array_buffer->buffer, cpu, GFP_KERNEL); } ring_buffer_read_prepare_sync(); for_each_tracing_cpu(cpu) { ring_buffer_read_start(iter->buffer_iter[cpu]); tracing_iter_reset(iter, cpu); } } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = ring_buffer_read_prepare(iter->array_buffer->buffer, cpu, GFP_KERNEL); ring_buffer_read_prepare_sync(); ring_buffer_read_start(iter->buffer_iter[cpu]); tracing_iter_reset(iter, cpu); } mutex_unlock(&trace_types_lock); return iter; fail: mutex_unlock(&trace_types_lock); kfree(iter->trace); kfree(iter->temp); kfree(iter->buffer_iter); release: seq_release_private(inode, file); return ERR_PTR(-ENOMEM); } int tracing_open_generic(struct inode *inode, struct file *filp) { int ret; ret = tracing_check_open_get_tr(NULL); if (ret) return ret; filp->private_data = inode->i_private; return 0; } bool tracing_is_disabled(void) { return (tracing_disabled) ? true: false; } /* * Open and update trace_array ref count. * Must have the current trace_array passed to it. */ int tracing_open_generic_tr(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; filp->private_data = inode->i_private; return 0; } /* * The private pointer of the inode is the trace_event_file. * Update the tr ref count associated to it. */ int tracing_open_file_tr(struct inode *inode, struct file *filp) { struct trace_event_file *file = inode->i_private; int ret; ret = tracing_check_open_get_tr(file->tr); if (ret) return ret; mutex_lock(&event_mutex); /* Fail if the file is marked for removal */ if (file->flags & EVENT_FILE_FL_FREED) { trace_array_put(file->tr); ret = -ENODEV; } else { event_file_get(file); } mutex_unlock(&event_mutex); if (ret) return ret; filp->private_data = inode->i_private; return 0; } int tracing_release_file_tr(struct inode *inode, struct file *filp) { struct trace_event_file *file = inode->i_private; trace_array_put(file->tr); event_file_put(file); return 0; } static int tracing_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; struct seq_file *m = file->private_data; struct trace_iterator *iter; int cpu; if (!(file->f_mode & FMODE_READ)) { trace_array_put(tr); return 0; } /* Writes do not use seq_file */ iter = m->private; mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { if (iter->buffer_iter[cpu]) ring_buffer_read_finish(iter->buffer_iter[cpu]); } if (iter->trace && iter->trace->close) iter->trace->close(iter); if (!iter->snapshot && tr->stop_count) /* reenable tracing if it was previously enabled */ tracing_start_tr(tr); __trace_array_put(tr); mutex_unlock(&trace_types_lock); mutex_destroy(&iter->mutex); free_cpumask_var(iter->started); kfree(iter->fmt); kfree(iter->temp); kfree(iter->trace); kfree(iter->buffer_iter); seq_release_private(inode, file); return 0; } static int tracing_release_generic_tr(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; trace_array_put(tr); return 0; } static int tracing_single_release_tr(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; trace_array_put(tr); return single_release(inode, file); } static int tracing_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; struct trace_iterator *iter; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { int cpu = tracing_get_cpu(inode); struct array_buffer *trace_buf = &tr->array_buffer; #ifdef CONFIG_TRACER_MAX_TRACE if (tr->current_trace->print_max) trace_buf = &tr->max_buffer; #endif if (cpu == RING_BUFFER_ALL_CPUS) tracing_reset_online_cpus(trace_buf); else tracing_reset_cpu(trace_buf, cpu); } if (file->f_mode & FMODE_READ) { iter = __tracing_open(inode, file, false); if (IS_ERR(iter)) ret = PTR_ERR(iter); else if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) iter->iter_flags |= TRACE_FILE_LAT_FMT; } if (ret < 0) trace_array_put(tr); return ret; } /* * Some tracers are not suitable for instance buffers. * A tracer is always available for the global array (toplevel) * or if it explicitly states that it is. */ static bool trace_ok_for_array(struct tracer *t, struct trace_array *tr) { return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances; } /* Find the next tracer that this trace array may use */ static struct tracer * get_tracer_for_array(struct trace_array *tr, struct tracer *t) { while (t && !trace_ok_for_array(t, tr)) t = t->next; return t; } static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_array *tr = m->private; struct tracer *t = v; (*pos)++; if (t) t = get_tracer_for_array(tr, t->next); return t; } static void *t_start(struct seq_file *m, loff_t *pos) { struct trace_array *tr = m->private; struct tracer *t; loff_t l = 0; mutex_lock(&trace_types_lock); t = get_tracer_for_array(tr, trace_types); for (; t && l < *pos; t = t_next(m, t, &l)) ; return t; } static void t_stop(struct seq_file *m, void *p) { mutex_unlock(&trace_types_lock); } static int t_show(struct seq_file *m, void *v) { struct tracer *t = v; if (!t) return 0; seq_puts(m, t->name); if (t->next) seq_putc(m, ' '); else seq_putc(m, '\n'); return 0; } static const struct seq_operations show_traces_seq_ops = { .start = t_start, .next = t_next, .stop = t_stop, .show = t_show, }; static int show_traces_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; struct seq_file *m; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; ret = seq_open(file, &show_traces_seq_ops); if (ret) { trace_array_put(tr); return ret; } m = file->private_data; m->private = tr; return 0; } static int show_traces_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; trace_array_put(tr); return seq_release(inode, file); } static ssize_t tracing_write_stub(struct file *filp, const char __user *ubuf, size_t count, loff_t *ppos) { return count; } loff_t tracing_lseek(struct file *file, loff_t offset, int whence) { int ret; if (file->f_mode & FMODE_READ) ret = seq_lseek(file, offset, whence); else file->f_pos = ret = 0; return ret; } static const struct file_operations tracing_fops = { .open = tracing_open, .read = seq_read, .read_iter = seq_read_iter, .splice_read = generic_file_splice_read, .write = tracing_write_stub, .llseek = tracing_lseek, .release = tracing_release, }; static const struct file_operations show_traces_fops = { .open = show_traces_open, .read = seq_read, .llseek = seq_lseek, .release = show_traces_release, }; static ssize_t tracing_cpumask_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct trace_array *tr = file_inode(filp)->i_private; char *mask_str; int len; len = snprintf(NULL, 0, "%*pb\n", cpumask_pr_args(tr->tracing_cpumask)) + 1; mask_str = kmalloc(len, GFP_KERNEL); if (!mask_str) return -ENOMEM; len = snprintf(mask_str, len, "%*pb\n", cpumask_pr_args(tr->tracing_cpumask)); if (len >= count) { count = -EINVAL; goto out_err; } count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); out_err: kfree(mask_str); return count; } int tracing_set_cpumask(struct trace_array *tr, cpumask_var_t tracing_cpumask_new) { int cpu; if (!tr) return -EINVAL; local_irq_disable(); arch_spin_lock(&tr->max_lock); for_each_tracing_cpu(cpu) { /* * Increase/decrease the disabled counter if we are * about to flip a bit in the cpumask: */ if (cpumask_test_cpu(cpu, tr->tracing_cpumask) && !cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu); #ifdef CONFIG_TRACER_MAX_TRACE ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu); #endif } if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu); #ifdef CONFIG_TRACER_MAX_TRACE ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu); #endif } } arch_spin_unlock(&tr->max_lock); local_irq_enable(); cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); return 0; } static ssize_t tracing_cpumask_write(struct file *filp, const char __user *ubuf, size_t count, loff_t *ppos) { struct trace_array *tr = file_inode(filp)->i_private; cpumask_var_t tracing_cpumask_new; int err; if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) return -ENOMEM; err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); if (err) goto err_free; err = tracing_set_cpumask(tr, tracing_cpumask_new); if (err) goto err_free; free_cpumask_var(tracing_cpumask_new); return count; err_free: free_cpumask_var(tracing_cpumask_new); return err; } static const struct file_operations tracing_cpumask_fops = { .open = tracing_open_generic_tr, .read = tracing_cpumask_read, .write = tracing_cpumask_write, .release = tracing_release_generic_tr, .llseek = generic_file_llseek, }; static int tracing_trace_options_show(struct seq_file *m, void *v) { struct tracer_opt *trace_opts; struct trace_array *tr = m->private; u32 tracer_flags; int i; mutex_lock(&trace_types_lock); tracer_flags = tr->current_trace->flags->val; trace_opts = tr->current_trace->flags->opts; for (i = 0; trace_options[i]; i++) { if (tr->trace_flags & (1 << i)) seq_printf(m, "%s\n", trace_options[i]); else seq_printf(m, "no%s\n", trace_options[i]); } for (i = 0; trace_opts[i].name; i++) { if (tracer_flags & trace_opts[i].bit) seq_printf(m, "%s\n", trace_opts[i].name); else seq_printf(m, "no%s\n", trace_opts[i].name); } mutex_unlock(&trace_types_lock); return 0; } static int __set_tracer_option(struct trace_array *tr, struct tracer_flags *tracer_flags, struct tracer_opt *opts, int neg) { struct tracer *trace = tracer_flags->trace; int ret; ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg); if (ret) return ret; if (neg) tracer_flags->val &= ~opts->bit; else tracer_flags->val |= opts->bit; return 0; } /* Try to assign a tracer specific option */ static int set_tracer_option(struct trace_array *tr, char *cmp, int neg) { struct tracer *trace = tr->current_trace; struct tracer_flags *tracer_flags = trace->flags; struct tracer_opt *opts = NULL; int i; for (i = 0; tracer_flags->opts[i].name; i++) { opts = &tracer_flags->opts[i]; if (strcmp(cmp, opts->name) == 0) return __set_tracer_option(tr, trace->flags, opts, neg); } return -EINVAL; } /* Some tracers require overwrite to stay enabled */ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) { if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set) return -1; return 0; } int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) { int *map; if ((mask == TRACE_ITER_RECORD_TGID) || (mask == TRACE_ITER_RECORD_CMD)) lockdep_assert_held(&event_mutex); /* do nothing if flag is already set */ if (!!(tr->trace_flags & mask) == !!enabled) return 0; /* Give the tracer a chance to approve the change */ if (tr->current_trace->flag_changed) if (tr->current_trace->flag_changed(tr, mask, !!enabled)) return -EINVAL; if (enabled) tr->trace_flags |= mask; else tr->trace_flags &= ~mask; if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); if (mask == TRACE_ITER_RECORD_TGID) { if (!tgid_map) { tgid_map_max = pid_max; map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map), GFP_KERNEL); /* * Pairs with smp_load_acquire() in * trace_find_tgid_ptr() to ensure that if it observes * the tgid_map we just allocated then it also observes * the corresponding tgid_map_max value. */ smp_store_release(&tgid_map, map); } if (!tgid_map) { tr->trace_flags &= ~TRACE_ITER_RECORD_TGID; return -ENOMEM; } trace_event_enable_tgid_record(enabled); } if (mask == TRACE_ITER_EVENT_FORK) trace_event_follow_fork(tr, enabled); if (mask == TRACE_ITER_FUNC_FORK) ftrace_pid_follow_fork(tr, enabled); if (mask == TRACE_ITER_OVERWRITE) { ring_buffer_change_overwrite(tr->array_buffer.buffer, enabled); #ifdef CONFIG_TRACER_MAX_TRACE ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled); #endif } if (mask == TRACE_ITER_PRINTK) { trace_printk_start_stop_comm(enabled); trace_printk_control(enabled); } return 0; } int trace_set_options(struct trace_array *tr, char *option) { char *cmp; int neg = 0; int ret; size_t orig_len = strlen(option); int len; cmp = strstrip(option); len = str_has_prefix(cmp, "no"); if (len) neg = 1; cmp += len; mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); ret = match_string(trace_options, -1, cmp); /* If no option could be set, test the specific tracer options */ if (ret < 0) ret = set_tracer_option(tr, cmp, neg); else ret = set_tracer_flag(tr, 1 << ret, !neg); mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); /* * If the first trailing whitespace is replaced with '\0' by strstrip, * turn it back into a space. */ if (orig_len > strlen(option)) option[strlen(option)] = ' '; return ret; } static void __init apply_trace_boot_options(void) { char *buf = trace_boot_options_buf; char *option; while (true) { option = strsep(&buf, ","); if (!option) break; if (*option) trace_set_options(&global_trace, option); /* Put back the comma to allow this to be called again */ if (buf) *(buf - 1) = ','; } } static ssize_t tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct seq_file *m = filp->private_data; struct trace_array *tr = m->private; char buf[64]; int ret; if (cnt >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; ret = trace_set_options(tr, buf); if (ret < 0) return ret; *ppos += cnt; return cnt; } static int tracing_trace_options_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; ret = single_open(file, tracing_trace_options_show, inode->i_private); if (ret < 0) trace_array_put(tr); return ret; } static const struct file_operations tracing_iter_fops = { .open = tracing_trace_options_open, .read = seq_read, .llseek = seq_lseek, .release = tracing_single_release_tr, .write = tracing_trace_options_write, }; static const char readme_msg[] = "tracing mini-HOWTO:\n\n" "# echo 0 > tracing_on : quick way to disable tracing\n" "# echo 1 > tracing_on : quick way to re-enable tracing\n\n" " Important files:\n" " trace\t\t\t- The static contents of the buffer\n" "\t\t\t To clear the buffer write into this file: echo > trace\n" " trace_pipe\t\t- A consuming read to see the contents of the buffer\n" " current_tracer\t- function and latency tracers\n" " available_tracers\t- list of configured tracers for current_tracer\n" " error_log\t- error log for failed commands (that support it)\n" " buffer_size_kb\t- view and modify size of per cpu buffer\n" " buffer_total_size_kb - view total size of all cpu buffers\n\n" " trace_clock\t\t-change the clock used to order events\n" " local: Per cpu clock but may not be synced across CPUs\n" " global: Synced across CPUs but slows tracing down.\n" " counter: Not a clock, but just an increment\n" " uptime: Jiffy counter from time of boot\n" " perf: Same clock that perf events use\n" #ifdef CONFIG_X86_64 " x86-tsc: TSC cycle counter\n" #endif "\n timestamp_mode\t-view the mode used to timestamp events\n" " delta: Delta difference against a buffer-wide timestamp\n" " absolute: Absolute (standalone) timestamp\n" "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n" "\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n" " tracing_cpumask\t- Limit which CPUs to trace\n" " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" "\t\t\t Remove sub-buffer with rmdir\n" " trace_options\t\t- Set format or modify how tracing happens\n" "\t\t\t Disable an option by prefixing 'no' to the\n" "\t\t\t option name\n" " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n" #ifdef CONFIG_DYNAMIC_FTRACE "\n available_filter_functions - list of functions that can be filtered on\n" " set_ftrace_filter\t- echo function name in here to only trace these\n" "\t\t\t functions\n" "\t accepts: func_full_name or glob-matching-pattern\n" "\t modules: Can select a group via module\n" "\t Format: :mod:<module-name>\n" "\t example: echo :mod:ext3 > set_ftrace_filter\n" "\t triggers: a command to perform when function is hit\n" "\t Format: <function>:<trigger>[:count]\n" "\t trigger: traceon, traceoff\n" "\t\t enable_event:<system>:<event>\n" "\t\t disable_event:<system>:<event>\n" #ifdef CONFIG_STACKTRACE "\t\t stacktrace\n" #endif #ifdef CONFIG_TRACER_SNAPSHOT "\t\t snapshot\n" #endif "\t\t dump\n" "\t\t cpudump\n" "\t example: echo do_fault:traceoff > set_ftrace_filter\n" "\t echo do_trap:traceoff:3 > set_ftrace_filter\n" "\t The first one will disable tracing every time do_fault is hit\n" "\t The second will disable tracing at most 3 times when do_trap is hit\n" "\t The first time do trap is hit and it disables tracing, the\n" "\t counter will decrement to 2. If tracing is already disabled,\n" "\t the counter will not decrement. It only decrements when the\n" "\t trigger did work\n" "\t To remove trigger without count:\n" "\t echo '!<function>:<trigger> > set_ftrace_filter\n" "\t To remove trigger with a count:\n" "\t echo '!<function>:<trigger>:0 > set_ftrace_filter\n" " set_ftrace_notrace\t- echo function name in here to never trace.\n" "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" "\t modules: Can select a group via module command :mod:\n" "\t Does not accept triggers\n" #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_TRACER " set_ftrace_pid\t- Write pid(s) to only function trace those pids\n" "\t\t (function)\n" " set_ftrace_notrace_pid\t- Write pid(s) to not function trace those pids\n" "\t\t (function)\n" #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" " set_graph_notrace\t- Do not trace the nested calls of a function (function_graph)\n" " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" #endif #ifdef CONFIG_TRACER_SNAPSHOT "\n snapshot\t\t- Like 'trace' but shows the content of the static\n" "\t\t\t snapshot buffer. Read the contents for more\n" "\t\t\t information\n" #endif #ifdef CONFIG_STACK_TRACER " stack_trace\t\t- Shows the max stack trace when active\n" " stack_max_size\t- Shows current max stack size that was traced\n" "\t\t\t Write into this file to reset the max size (trigger a\n" "\t\t\t new trace)\n" #ifdef CONFIG_DYNAMIC_FTRACE " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace\n" "\t\t\t traces\n" #endif #endif /* CONFIG_STACK_TRACER */ #ifdef CONFIG_DYNAMIC_EVENTS " dynamic_events\t\t- Create/append/remove/show the generic dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif #ifdef CONFIG_KPROBE_EVENTS " kprobe_events\t\t- Create/append/remove/show the kernel dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif #ifdef CONFIG_UPROBE_EVENTS " uprobe_events\t\t- Create/append/remove/show the userspace dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) "\t accepts: event-definitions (one definition per line)\n" "\t Format: p[:[<group>/]<event>] <place> [<args>]\n" "\t r[maxactive][:[<group>/]<event>] <place> [<args>]\n" #ifdef CONFIG_HIST_TRIGGERS "\t s:[synthetic/]<event> <field> [<field>]\n" #endif "\t -:[<group>/]<event>\n" #ifdef CONFIG_KPROBE_EVENTS "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" "place (kretprobe): [<module>:]<symbol>[+<offset>]%return|<memaddr>\n" #endif #ifdef CONFIG_UPROBE_EVENTS " place (uprobe): <path>:<offset>[%return][(ref_ctr_offset)]\n" #endif "\t args: <name>=fetcharg[:type]\n" "\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n" #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n" #else "\t $stack<index>, $stack, $retval, $comm,\n" #endif "\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n" "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n" "\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n" "\t <type>\\[<array-size>\\]\n" #ifdef CONFIG_HIST_TRIGGERS "\t field: <stype> <name>;\n" "\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n" "\t [unsigned] char/int/long\n" #endif #endif " events/\t\t- Directory containing all trace event subsystems:\n" " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" " events/<system>/\t- Directory containing all trace events for <system>:\n" " enable\t\t- Write 0/1 to enable/disable tracing of all <system>\n" "\t\t\t events\n" " filter\t\t- If set, only events passing filter are traced\n" " events/<system>/<event>/\t- Directory containing control files for\n" "\t\t\t <event>:\n" " enable\t\t- Write 0/1 to enable/disable tracing of <event>\n" " filter\t\t- If set, only events passing filter are traced\n" " trigger\t\t- If set, a command to perform when event is hit\n" "\t Format: <trigger>[:count][if <filter>]\n" "\t trigger: traceon, traceoff\n" "\t enable_event:<system>:<event>\n" "\t disable_event:<system>:<event>\n" #ifdef CONFIG_HIST_TRIGGERS "\t enable_hist:<system>:<event>\n" "\t disable_hist:<system>:<event>\n" #endif #ifdef CONFIG_STACKTRACE "\t\t stacktrace\n" #endif #ifdef CONFIG_TRACER_SNAPSHOT "\t\t snapshot\n" #endif #ifdef CONFIG_HIST_TRIGGERS "\t\t hist (see below)\n" #endif "\t example: echo traceoff > events/block/block_unplug/trigger\n" "\t echo traceoff:3 > events/block/block_unplug/trigger\n" "\t echo 'enable_event:kmem:kmalloc:3 if nr_rq > 1' > \\\n" "\t events/block/block_unplug/trigger\n" "\t The first disables tracing every time block_unplug is hit.\n" "\t The second disables tracing the first 3 times block_unplug is hit.\n" "\t The third enables the kmalloc event the first 3 times block_unplug\n" "\t is hit and has value of greater than 1 for the 'nr_rq' event field.\n" "\t Like function triggers, the counter is only decremented if it\n" "\t enabled or disabled tracing.\n" "\t To remove a trigger without a count:\n" "\t echo '!<trigger> > <system>/<event>/trigger\n" "\t To remove a trigger with a count:\n" "\t echo '!<trigger>:0 > <system>/<event>/trigger\n" "\t Filters can be ignored when removing a trigger.\n" #ifdef CONFIG_HIST_TRIGGERS " hist trigger\t- If set, event hits are aggregated into a hash table\n" "\t Format: hist:keys=<field1[,field2,...]>\n" "\t [:values=<field1[,field2,...]>]\n" "\t [:sort=<field1[,field2,...]>]\n" "\t [:size=#entries]\n" "\t [:pause][:continue][:clear]\n" "\t [:name=histname1]\n" "\t [:<handler>.<action>]\n" "\t [if <filter>]\n\n" "\t Note, special fields can be used as well:\n" "\t common_timestamp - to record current timestamp\n" "\t common_cpu - to record the CPU the event happened on\n" "\n" "\t When a matching event is hit, an entry is added to a hash\n" "\t table using the key(s) and value(s) named, and the value of a\n" "\t sum called 'hitcount' is incremented. Keys and values\n" "\t correspond to fields in the event's format description. Keys\n" "\t can be any field, or the special string 'stacktrace'.\n" "\t Compound keys consisting of up to two fields can be specified\n" "\t by the 'keys' keyword. Values must correspond to numeric\n" "\t fields. Sort keys consisting of up to two fields can be\n" "\t specified using the 'sort' keyword. The sort direction can\n" "\t be modified by appending '.descending' or '.ascending' to a\n" "\t sort field. The 'size' parameter can be used to specify more\n" "\t or fewer than the default 2048 entries for the hashtable size.\n" "\t If a hist trigger is given a name using the 'name' parameter,\n" "\t its histogram data will be shared with other triggers of the\n" "\t same name, and trigger hits will update this common data.\n\n" "\t Reading the 'hist' file for the event will dump the hash\n" "\t table in its entirety to stdout. If there are multiple hist\n" "\t triggers attached to an event, there will be a table for each\n" "\t trigger in the output. The table displayed for a named\n" "\t trigger will be the same as any other instance having the\n" "\t same name. The default format used to display a given field\n" "\t can be modified by appending any of the following modifiers\n" "\t to the field name, as applicable:\n\n" "\t .hex display a number as a hex value\n" "\t .sym display an address as a symbol\n" "\t .sym-offset display an address as a symbol and offset\n" "\t .execname display a common_pid as a program name\n" "\t .syscall display a syscall id as a syscall name\n" "\t .log2 display log2 value rather than raw number\n" "\t .usecs display a common_timestamp in microseconds\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 'continue' can be used to start or\n" "\t restart a paused hist trigger.\n\n" "\t The 'clear' parameter will clear the contents of a running\n" "\t hist trigger and leave its current paused/active state\n" "\t unchanged.\n\n" "\t The enable_hist and disable_hist triggers can be used to\n" "\t have one event conditionally start and stop another event's\n" "\t already-attached hist trigger. The syntax is analogous to\n" "\t the enable_event and disable_event triggers.\n\n" "\t Hist trigger handlers and actions are executed whenever a\n" "\t a histogram entry is added or updated. They take the form:\n\n" "\t <handler>.<action>\n\n" "\t The available handlers are:\n\n" "\t onmatch(matching.event) - invoke on addition or update\n" "\t onmax(var) - invoke if var exceeds current max\n" "\t onchange(var) - invoke action if var changes\n\n" "\t The available actions are:\n\n" "\t trace(<synthetic_event>,param list) - generate synthetic event\n" "\t save(field,...) - save current event fields\n" #ifdef CONFIG_TRACER_SNAPSHOT "\t snapshot() - snapshot the trace buffer\n\n" #endif #ifdef CONFIG_SYNTH_EVENTS " events/synthetic_events\t- Create/append/remove/show synthetic events\n" "\t Write into this file to define/undefine new synthetic events.\n" "\t example: echo 'myevent u64 lat; char name[]' >> synthetic_events\n" #endif #endif ; static ssize_t tracing_readme_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { return simple_read_from_buffer(ubuf, cnt, ppos, readme_msg, strlen(readme_msg)); } static const struct file_operations tracing_readme_fops = { .open = tracing_open_generic, .read = tracing_readme_read, .llseek = generic_file_llseek, }; static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos) { int pid = ++(*pos); return trace_find_tgid_ptr(pid); } static void *saved_tgids_start(struct seq_file *m, loff_t *pos) { int pid = *pos; return trace_find_tgid_ptr(pid); } static void saved_tgids_stop(struct seq_file *m, void *v) { } static int saved_tgids_show(struct seq_file *m, void *v) { int *entry = (int *)v; int pid = entry - tgid_map; int tgid = *entry; if (tgid == 0) return SEQ_SKIP; seq_printf(m, "%d %d\n", pid, tgid); return 0; } static const struct seq_operations tracing_saved_tgids_seq_ops = { .start = saved_tgids_start, .stop = saved_tgids_stop, .next = saved_tgids_next, .show = saved_tgids_show, }; static int tracing_saved_tgids_open(struct inode *inode, struct file *filp) { int ret; ret = tracing_check_open_get_tr(NULL); if (ret) return ret; return seq_open(filp, &tracing_saved_tgids_seq_ops); } static const struct file_operations tracing_saved_tgids_fops = { .open = tracing_saved_tgids_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) { unsigned int *ptr = v; if (*pos || m->count) ptr++; (*pos)++; for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num]; ptr++) { if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) continue; return ptr; } return NULL; } static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) { void *v; loff_t l = 0; preempt_disable(); arch_spin_lock(&trace_cmdline_lock); v = &savedcmd->map_cmdline_to_pid[0]; while (l <= *pos) { v = saved_cmdlines_next(m, v, &l); if (!v) return NULL; } return v; } static void saved_cmdlines_stop(struct seq_file *m, void *v) { arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); } static int saved_cmdlines_show(struct seq_file *m, void *v) { char buf[TASK_COMM_LEN]; unsigned int *pid = v; __trace_find_cmdline(*pid, buf); seq_printf(m, "%d %s\n", *pid, buf); return 0; } static const struct seq_operations tracing_saved_cmdlines_seq_ops = { .start = saved_cmdlines_start, .next = saved_cmdlines_next, .stop = saved_cmdlines_stop, .show = saved_cmdlines_show, }; static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) { int ret; ret = tracing_check_open_get_tr(NULL); if (ret) return ret; return seq_open(filp, &tracing_saved_cmdlines_seq_ops); } static const struct file_operations tracing_saved_cmdlines_fops = { .open = tracing_saved_cmdlines_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static ssize_t tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; int r; preempt_disable(); arch_spin_lock(&trace_cmdline_lock); r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static int tracing_resize_saved_cmdlines(unsigned int val) { struct saved_cmdlines_buffer *s, *savedcmd_temp; s = allocate_cmdlines_buffer(val); if (!s) return -ENOMEM; preempt_disable(); arch_spin_lock(&trace_cmdline_lock); savedcmd_temp = savedcmd; savedcmd = s; arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); free_saved_cmdlines_buffer(savedcmd_temp); return 0; } static ssize_t tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long val; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; /* must have at least 1 entry or less than PID_MAX_DEFAULT */ if (!val || val > PID_MAX_DEFAULT) return -EINVAL; ret = tracing_resize_saved_cmdlines((unsigned int)val); if (ret < 0) return ret; *ppos += cnt; return cnt; } static const struct file_operations tracing_saved_cmdlines_size_fops = { .open = tracing_open_generic, .read = tracing_saved_cmdlines_size_read, .write = tracing_saved_cmdlines_size_write, }; #ifdef CONFIG_TRACE_EVAL_MAP_FILE static union trace_eval_map_item * update_eval_map(union trace_eval_map_item *ptr) { if (!ptr->map.eval_string) { if (ptr->tail.next) { ptr = ptr->tail.next; /* Set ptr to the next real item (skip head) */ ptr++; } else return NULL; } return ptr; } static void *eval_map_next(struct seq_file *m, void *v, loff_t *pos) { union trace_eval_map_item *ptr = v; /* * Paranoid! If ptr points to end, we don't want to increment past it. * This really should never happen. */ (*pos)++; ptr = update_eval_map(ptr); if (WARN_ON_ONCE(!ptr)) return NULL; ptr++; ptr = update_eval_map(ptr); return ptr; } static void *eval_map_start(struct seq_file *m, loff_t *pos) { union trace_eval_map_item *v; loff_t l = 0; mutex_lock(&trace_eval_mutex); v = trace_eval_maps; if (v) v++; while (v && l < *pos) { v = eval_map_next(m, v, &l); } return v; } static void eval_map_stop(struct seq_file *m, void *v) { mutex_unlock(&trace_eval_mutex); } static int eval_map_show(struct seq_file *m, void *v) { union trace_eval_map_item *ptr = v; seq_printf(m, "%s %ld (%s)\n", ptr->map.eval_string, ptr->map.eval_value, ptr->map.system); return 0; } static const struct seq_operations tracing_eval_map_seq_ops = { .start = eval_map_start, .next = eval_map_next, .stop = eval_map_stop, .show = eval_map_show, }; static int tracing_eval_map_open(struct inode *inode, struct file *filp) { int ret; ret = tracing_check_open_get_tr(NULL); if (ret) return ret; return seq_open(filp, &tracing_eval_map_seq_ops); } static const struct file_operations tracing_eval_map_fops = { .open = tracing_eval_map_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static inline union trace_eval_map_item * trace_eval_jmp_to_tail(union trace_eval_map_item *ptr) { /* Return tail of array given the head */ return ptr + ptr->head.length + 1; } static void trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, int len) { struct trace_eval_map **stop; struct trace_eval_map **map; union trace_eval_map_item *map_array; union trace_eval_map_item *ptr; stop = start + len; /* * The trace_eval_maps contains the map plus a head and tail item, * where the head holds the module and length of array, and the * tail holds a pointer to the next list. */ map_array = kmalloc_array(len + 2, sizeof(*map_array), GFP_KERNEL); if (!map_array) { pr_warn("Unable to allocate trace eval mapping\n"); return; } mutex_lock(&trace_eval_mutex); if (!trace_eval_maps) trace_eval_maps = map_array; else { ptr = trace_eval_maps; for (;;) { ptr = trace_eval_jmp_to_tail(ptr); if (!ptr->tail.next) break; ptr = ptr->tail.next; } ptr->tail.next = map_array; } map_array->head.mod = mod; map_array->head.length = len; map_array++; for (map = start; (unsigned long)map < (unsigned long)stop; map++) { map_array->map = **map; map_array++; } memset(map_array, 0, sizeof(*map_array)); mutex_unlock(&trace_eval_mutex); } static void trace_create_eval_file(struct dentry *d_tracer) { trace_create_file("eval_map", 0444, d_tracer, NULL, &tracing_eval_map_fops); } #else /* CONFIG_TRACE_EVAL_MAP_FILE */ static inline void trace_create_eval_file(struct dentry *d_tracer) { } static inline void trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, int len) { } #endif /* !CONFIG_TRACE_EVAL_MAP_FILE */ static void trace_insert_eval_map(struct module *mod, struct trace_eval_map **start, int len) { struct trace_eval_map **map; if (len <= 0) return; map = start; trace_event_eval_update(map, len); trace_insert_eval_map_file(mod, start, len); } static ssize_t tracing_set_trace_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; char buf[MAX_TRACER_SIZE+2]; int r; mutex_lock(&trace_types_lock); r = sprintf(buf, "%s\n", tr->current_trace->name); mutex_unlock(&trace_types_lock); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } int tracer_init(struct tracer *t, struct trace_array *tr) { tracing_reset_online_cpus(&tr->array_buffer); return t->init(tr); } static void set_buffer_entries(struct array_buffer *buf, unsigned long val) { int cpu; for_each_tracing_cpu(cpu) per_cpu_ptr(buf->data, cpu)->entries = val; } static void update_buffer_entries(struct array_buffer *buf, int cpu) { if (cpu == RING_BUFFER_ALL_CPUS) { set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0)); } else { per_cpu_ptr(buf->data, cpu)->entries = ring_buffer_size(buf->buffer, cpu); } } #ifdef CONFIG_TRACER_MAX_TRACE /* resize @tr's buffer to the size of @size_tr's entries */ static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, struct array_buffer *size_buf, int cpu_id) { int cpu, ret = 0; if (cpu_id == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { ret = ring_buffer_resize(trace_buf->buffer, per_cpu_ptr(size_buf->data, cpu)->entries, cpu); if (ret < 0) break; per_cpu_ptr(trace_buf->data, cpu)->entries = per_cpu_ptr(size_buf->data, cpu)->entries; } } else { ret = ring_buffer_resize(trace_buf->buffer, per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id); if (ret == 0) per_cpu_ptr(trace_buf->data, cpu_id)->entries = per_cpu_ptr(size_buf->data, cpu_id)->entries; } return ret; } #endif /* CONFIG_TRACER_MAX_TRACE */ static int __tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu) { int ret; /* * If kernel or user changes the size of the ring buffer * we use the size that was given, and we can forget about * expanding it later. */ ring_buffer_expanded = true; /* May be called before buffers are initialized */ if (!tr->array_buffer.buffer) return 0; /* Do not allow tracing while resizing ring buffer */ tracing_stop_tr(tr); ret = ring_buffer_resize(tr->array_buffer.buffer, size, cpu); if (ret < 0) goto out_start; #ifdef CONFIG_TRACER_MAX_TRACE if (!tr->allocated_snapshot) goto out; ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu); if (ret < 0) { int r = resize_buffer_duplicate_size(&tr->array_buffer, &tr->array_buffer, cpu); if (r < 0) { /* * AARGH! We are left with different * size max buffer!!!! * The max buffer is our "snapshot" buffer. * When a tracer needs a snapshot (one of the * latency tracers), it swaps the max buffer * with the saved snap shot. We succeeded to * update the size of the main buffer, but failed to * update the size of the max buffer. But when we tried * to reset the main buffer to the original size, we * failed there too. This is very unlikely to * happen, but if it does, warn and kill all * tracing. */ WARN_ON(1); tracing_disabled = 1; } goto out_start; } update_buffer_entries(&tr->max_buffer, cpu); out: #endif /* CONFIG_TRACER_MAX_TRACE */ update_buffer_entries(&tr->array_buffer, cpu); out_start: tracing_start_tr(tr); return ret; } ssize_t tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu_id) { int ret = size; mutex_lock(&trace_types_lock); if (cpu_id != RING_BUFFER_ALL_CPUS) { /* make sure, this cpu is enabled in the mask */ if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) { ret = -EINVAL; goto out; } } ret = __tracing_resize_ring_buffer(tr, size, cpu_id); if (ret < 0) ret = -ENOMEM; out: mutex_unlock(&trace_types_lock); return ret; } /** * tracing_update_buffers - used by tracing facility to expand ring buffers * * To save on memory when the tracing is never used on a system with it * configured in. The ring buffers are set to a minimum size. But once * a user starts to use the tracing facility, then they need to grow * to their default size. * * This function is to be called when a tracer is about to be used. */ int tracing_update_buffers(void) { int ret = 0; mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size, RING_BUFFER_ALL_CPUS); mutex_unlock(&trace_types_lock); return ret; } struct trace_option_dentry; static void create_trace_option_files(struct trace_array *tr, struct tracer *tracer); /* * Used to clear out the tracer before deletion of an instance. * Must have trace_types_lock held. */ static void tracing_set_nop(struct trace_array *tr) { if (tr->current_trace == &nop_trace) return; tr->current_trace->enabled--; if (tr->current_trace->reset) tr->current_trace->reset(tr); tr->current_trace = &nop_trace; } static bool tracer_options_updated; static void add_tracer_options(struct trace_array *tr, struct tracer *t) { /* Only enable if the directory has been created already. */ if (!tr->dir) return; /* Only create trace option files after update_tracer_options finish */ if (!tracer_options_updated) return; create_trace_option_files(tr, t); } int tracing_set_tracer(struct trace_array *tr, const char *buf) { struct tracer *t; #ifdef CONFIG_TRACER_MAX_TRACE bool had_max_tr; #endif int ret = 0; mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) { ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); if (ret < 0) goto out; ret = 0; } for (t = trace_types; t; t = t->next) { if (strcmp(t->name, buf) == 0) break; } if (!t) { ret = -EINVAL; goto out; } if (t == tr->current_trace) goto out; #ifdef CONFIG_TRACER_SNAPSHOT if (t->use_max_tr) { local_irq_disable(); arch_spin_lock(&tr->max_lock); if (tr->cond_snapshot) ret = -EBUSY; arch_spin_unlock(&tr->max_lock); local_irq_enable(); if (ret) goto out; } #endif /* Some tracers won't work on kernel command line */ if (system_state < SYSTEM_RUNNING && t->noboot) { pr_warn("Tracer '%s' is not allowed on command line, ignored\n", t->name); goto out; } /* Some tracers are only allowed for the top level buffer */ if (!trace_ok_for_array(t, tr)) { ret = -EINVAL; goto out; } /* If trace pipe files are being read, we can't change the tracer */ if (tr->trace_ref) { ret = -EBUSY; goto out; } trace_branch_disable(); tr->current_trace->enabled--; if (tr->current_trace->reset) tr->current_trace->reset(tr); #ifdef CONFIG_TRACER_MAX_TRACE had_max_tr = tr->current_trace->use_max_tr; /* Current trace needs to be nop_trace before synchronize_rcu */ tr->current_trace = &nop_trace; if (had_max_tr && !t->use_max_tr) { /* * We need to make sure that the update_max_tr sees that * current_trace changed to nop_trace to keep it from * swapping the buffers after we resize it. * The update_max_tr is called from interrupts disabled * so a synchronized_sched() is sufficient. */ synchronize_rcu(); free_snapshot(tr); } if (t->use_max_tr && !tr->allocated_snapshot) { ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) goto out; } #else tr->current_trace = &nop_trace; #endif if (t->init) { ret = tracer_init(t, tr); if (ret) goto out; } tr->current_trace = t; tr->current_trace->enabled++; trace_branch_enable(tr); out: mutex_unlock(&trace_types_lock); return ret; } static ssize_t tracing_set_trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; char buf[MAX_TRACER_SIZE+1]; int i; size_t ret; int err; ret = cnt; if (cnt > MAX_TRACER_SIZE) cnt = MAX_TRACER_SIZE; if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; /* strip ending whitespace. */ for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) buf[i] = 0; err = tracing_set_tracer(tr, buf); if (err) return err; *ppos += ret; return ret; } static ssize_t tracing_nsecs_read(unsigned long *ptr, char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; int r; r = snprintf(buf, sizeof(buf), "%ld\n", *ptr == (unsigned long)-1 ? -1 : nsecs_to_usecs(*ptr)); if (r > sizeof(buf)) r = sizeof(buf); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static ssize_t tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long val; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; *ptr = val * 1000; return cnt; } static ssize_t tracing_thresh_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { return tracing_nsecs_read(&tracing_thresh, ubuf, cnt, ppos); } static ssize_t tracing_thresh_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; int ret; mutex_lock(&trace_types_lock); ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos); if (ret < 0) goto out; if (tr->current_trace->update_thresh) { ret = tr->current_trace->update_thresh(tr); if (ret < 0) goto out; } ret = cnt; out: mutex_unlock(&trace_types_lock); return ret; } #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) static ssize_t tracing_max_lat_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { return tracing_nsecs_read(filp->private_data, ubuf, cnt, ppos); } static ssize_t tracing_max_lat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos); } #endif static int open_pipe_on_cpu(struct trace_array *tr, int cpu) { if (cpu == RING_BUFFER_ALL_CPUS) { if (cpumask_empty(tr->pipe_cpumask)) { cpumask_setall(tr->pipe_cpumask); return 0; } } else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) { cpumask_set_cpu(cpu, tr->pipe_cpumask); return 0; } return -EBUSY; } static void close_pipe_on_cpu(struct trace_array *tr, int cpu) { if (cpu == RING_BUFFER_ALL_CPUS) { WARN_ON(!cpumask_full(tr->pipe_cpumask)); cpumask_clear(tr->pipe_cpumask); } else { WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask)); cpumask_clear_cpu(cpu, tr->pipe_cpumask); } } static int tracing_open_pipe(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; struct trace_iterator *iter; int cpu; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; mutex_lock(&trace_types_lock); cpu = tracing_get_cpu(inode); ret = open_pipe_on_cpu(tr, cpu); if (ret) goto fail_pipe_on_cpu; /* create a buffer to store the information to pass to userspace */ iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { ret = -ENOMEM; goto fail_alloc_iter; } trace_seq_init(&iter->seq); iter->trace = tr->current_trace; if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { ret = -ENOMEM; goto fail; } /* trace pipe does not show start of buffer */ cpumask_setall(iter->started); if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) iter->iter_flags |= TRACE_FILE_LAT_FMT; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ if (trace_clocks[tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; iter->tr = tr; iter->array_buffer = &tr->array_buffer; iter->cpu_file = cpu; mutex_init(&iter->mutex); filp->private_data = iter; if (iter->trace->pipe_open) iter->trace->pipe_open(iter); nonseekable_open(inode, filp); tr->trace_ref++; mutex_unlock(&trace_types_lock); return ret; fail: kfree(iter); fail_alloc_iter: close_pipe_on_cpu(tr, cpu); fail_pipe_on_cpu: __trace_array_put(tr); mutex_unlock(&trace_types_lock); return ret; } static int tracing_release_pipe(struct inode *inode, struct file *file) { struct trace_iterator *iter = file->private_data; struct trace_array *tr = inode->i_private; mutex_lock(&trace_types_lock); tr->trace_ref--; if (iter->trace->pipe_close) iter->trace->pipe_close(iter); close_pipe_on_cpu(tr, iter->cpu_file); mutex_unlock(&trace_types_lock); free_cpumask_var(iter->started); kfree(iter->temp); mutex_destroy(&iter->mutex); kfree(iter); trace_array_put(tr); return 0; } static __poll_t trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table) { struct trace_array *tr = iter->tr; /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) return EPOLLIN | EPOLLRDNORM; if (tr->trace_flags & TRACE_ITER_BLOCK) /* * Always select as readable when in blocking mode */ return EPOLLIN | EPOLLRDNORM; else return ring_buffer_poll_wait(iter->array_buffer->buffer, iter->cpu_file, filp, poll_table, iter->tr->buffer_percent); } static __poll_t tracing_poll_pipe(struct file *filp, poll_table *poll_table) { struct trace_iterator *iter = filp->private_data; return trace_poll(iter, filp, poll_table); } /* Must be called with iter->mutex held. */ static int tracing_wait_pipe(struct file *filp) { struct trace_iterator *iter = filp->private_data; int ret; while (trace_empty(iter)) { if ((filp->f_flags & O_NONBLOCK)) { return -EAGAIN; } /* * We block until we read something and tracing is disabled. * We still block if tracing is disabled, but we have never * read anything. This allows a user to cat this file, and * then enable tracing. But after we have read something, * we give an EOF when tracing is again disabled. * * iter->pos will be 0 if we haven't read anything. */ if (!tracer_tracing_is_on(iter->tr) && iter->pos) break; mutex_unlock(&iter->mutex); ret = wait_on_pipe(iter, 0); mutex_lock(&iter->mutex); if (ret) return ret; } return 1; } /* * Consumer reader. */ static ssize_t tracing_read_pipe(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_iterator *iter = filp->private_data; ssize_t sret; /* * Avoid more than one consumer on a single file descriptor * This is just a matter of traces coherency, the ring buffer itself * is protected. */ mutex_lock(&iter->mutex); /* return any leftover data */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); if (sret != -EBUSY) goto out; trace_seq_init(&iter->seq); if (iter->trace->read) { sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); if (sret) goto out; } waitagain: sret = tracing_wait_pipe(filp); if (sret <= 0) goto out; /* stop when tracing is finished */ if (trace_empty(iter)) { sret = 0; goto out; } if (cnt >= PAGE_SIZE) cnt = PAGE_SIZE - 1; /* reset all but tr, trace, and overruns */ memset(&iter->seq, 0, sizeof(struct trace_iterator) - offsetof(struct trace_iterator, seq)); cpumask_clear(iter->started); trace_seq_init(&iter->seq); iter->pos = -1; trace_event_read_lock(); trace_access_lock(iter->cpu_file); while (trace_find_next_entry_inc(iter) != NULL) { enum print_line_t ret; int save_len = iter->seq.seq.len; ret = print_trace_line(iter); if (ret == TRACE_TYPE_PARTIAL_LINE) { /* * If one print_trace_line() fills entire trace_seq in one shot, * trace_seq_to_user() will returns -EBUSY because save_len == 0, * In this case, we need to consume it, otherwise, loop will peek * this event next time, resulting in an infinite loop. */ if (save_len == 0) { iter->seq.full = 0; trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n"); trace_consume(iter); break; } /* In other cases, don't print partial lines */ iter->seq.seq.len = save_len; break; } if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(iter); if (trace_seq_used(&iter->seq) >= cnt) break; /* * Setting the full flag means we reached the trace_seq buffer * size and we should leave by partial output condition above. * One of the trace_seq_* functions is not used properly. */ WARN_ONCE(iter->seq.full, "full flag set for trace type %d", iter->ent->type); } trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq)) trace_seq_init(&iter->seq); /* * If there was nothing to send to user, in spite of consuming trace * entries, go back to wait for more entries. */ if (sret == -EBUSY) goto waitagain; out: mutex_unlock(&iter->mutex); return sret; } static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, unsigned int idx) { __free_page(spd->pages[idx]); } static size_t tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) { size_t count; int save_len; int ret; /* Seq buffer is page-sized, exactly what we need. */ for (;;) { save_len = iter->seq.seq.len; ret = print_trace_line(iter); if (trace_seq_has_overflowed(&iter->seq)) { iter->seq.seq.len = save_len; break; } /* * This should not be hit, because it should only * be set if the iter->seq overflowed. But check it * anyway to be safe. */ if (ret == TRACE_TYPE_PARTIAL_LINE) { iter->seq.seq.len = save_len; break; } count = trace_seq_used(&iter->seq) - save_len; if (rem < count) { rem = 0; iter->seq.seq.len = save_len; break; } if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(iter); rem -= count; if (!trace_find_next_entry_inc(iter)) { rem = 0; iter->ent = NULL; break; } } return rem; } static ssize_t tracing_splice_read_pipe(struct file *filp, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct page *pages_def[PIPE_DEF_BUFFERS]; struct partial_page partial_def[PIPE_DEF_BUFFERS]; struct trace_iterator *iter = filp->private_data; struct splice_pipe_desc spd = { .pages = pages_def, .partial = partial_def, .nr_pages = 0, /* This gets updated below. */ .nr_pages_max = PIPE_DEF_BUFFERS, .ops = &default_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; ssize_t ret; size_t rem; unsigned int i; if (splice_grow_spd(pipe, &spd)) return -ENOMEM; mutex_lock(&iter->mutex); if (iter->trace->splice_read) { ret = iter->trace->splice_read(iter, filp, ppos, pipe, len, flags); if (ret) goto out_err; } ret = tracing_wait_pipe(filp); if (ret <= 0) goto out_err; if (!iter->ent && !trace_find_next_entry_inc(iter)) { ret = -EFAULT; goto out_err; } trace_event_read_lock(); trace_access_lock(iter->cpu_file); /* Fill as many pages as possible. */ for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) { spd.pages[i] = alloc_page(GFP_KERNEL); if (!spd.pages[i]) break; rem = tracing_fill_pipe_page(rem, iter); /* Copy the data into the page, so we can start over. */ ret = trace_seq_to_buffer(&iter->seq, page_address(spd.pages[i]), trace_seq_used(&iter->seq)); if (ret < 0) { __free_page(spd.pages[i]); break; } spd.partial[i].offset = 0; spd.partial[i].len = trace_seq_used(&iter->seq); trace_seq_init(&iter->seq); } trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); mutex_unlock(&iter->mutex); spd.nr_pages = i; if (i) ret = splice_to_pipe(pipe, &spd); else ret = 0; out: splice_shrink_spd(&spd); return ret; out_err: mutex_unlock(&iter->mutex); goto out; } static ssize_t tracing_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct inode *inode = file_inode(filp); struct trace_array *tr = inode->i_private; int cpu = tracing_get_cpu(inode); char buf[64]; int r = 0; ssize_t ret; mutex_lock(&trace_types_lock); if (cpu == RING_BUFFER_ALL_CPUS) { int cpu, buf_size_same; unsigned long size; size = 0; buf_size_same = 1; /* check if all cpu sizes are same */ for_each_tracing_cpu(cpu) { /* fill in the size from first enabled cpu */ if (size == 0) size = per_cpu_ptr(tr->array_buffer.data, cpu)->entries; if (size != per_cpu_ptr(tr->array_buffer.data, cpu)->entries) { buf_size_same = 0; break; } } if (buf_size_same) { if (!ring_buffer_expanded) r = sprintf(buf, "%lu (expanded: %lu)\n", size >> 10, trace_buf_size >> 10); else r = sprintf(buf, "%lu\n", size >> 10); } else r = sprintf(buf, "X\n"); } else r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10); mutex_unlock(&trace_types_lock); ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); return ret; } static ssize_t tracing_entries_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct inode *inode = file_inode(filp); struct trace_array *tr = inode->i_private; unsigned long val; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; /* must have at least 1 entry */ if (!val) return -EINVAL; /* value is in KB */ val <<= 10; ret = tracing_resize_ring_buffer(tr, val, tracing_get_cpu(inode)); if (ret < 0) return ret; *ppos += cnt; return cnt; } static ssize_t tracing_total_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; char buf[64]; int r, cpu; unsigned long size = 0, expanded_size = 0; mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { size += per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10; if (!ring_buffer_expanded) expanded_size += trace_buf_size >> 10; } if (ring_buffer_expanded) r = sprintf(buf, "%lu\n", size); else r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size); mutex_unlock(&trace_types_lock); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static ssize_t tracing_free_buffer_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { /* * There is no need to read what the user has written, this function * is just to make sure that there is no error when "echo" is used */ *ppos += cnt; return cnt; } static int tracing_free_buffer_release(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; /* disable tracing ? */ if (tr->trace_flags & TRACE_ITER_STOP_ON_FREE) tracer_tracing_off(tr); /* resize the ring buffer to 0 */ tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); trace_array_put(tr); return 0; } static ssize_t tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { struct trace_array *tr = filp->private_data; struct ring_buffer_event *event; enum event_trigger_type tt = ETT_NONE; struct trace_buffer *buffer; struct print_entry *entry; unsigned long irq_flags; ssize_t written; int size; int len; /* Used in tracing_mark_raw_write() as well */ #define FAULTED_STR "<faulted>" #define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */ if (tracing_disabled) return -EINVAL; if (!(tr->trace_flags & TRACE_ITER_MARKERS)) return -EINVAL; if (cnt > TRACE_BUF_SIZE) cnt = TRACE_BUF_SIZE; BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); local_save_flags(irq_flags); size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */ /* If less than "<faulted>", then make sure we can still add that */ if (cnt < FAULTED_SIZE) size += FAULTED_SIZE - cnt; buffer = tr->array_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, irq_flags, preempt_count()); if (unlikely(!event)) /* Ring buffer disabled, return as if not open for write */ return -EBADF; entry = ring_buffer_event_data(event); entry->ip = _THIS_IP_; len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); if (len) { memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); cnt = FAULTED_SIZE; written = -EFAULT; } else written = cnt; if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) { /* do not add \n before testing triggers, but add \0 */ entry->buf[cnt] = '\0'; tt = event_triggers_call(tr->trace_marker_file, entry, event); } if (entry->buf[cnt - 1] != '\n') { entry->buf[cnt] = '\n'; entry->buf[cnt + 1] = '\0'; } else entry->buf[cnt] = '\0'; if (static_branch_unlikely(&trace_marker_exports_enabled)) ftrace_exports(event, TRACE_EXPORT_MARKER); __buffer_unlock_commit(buffer, event); if (tt) event_triggers_post_call(tr->trace_marker_file, tt); if (written > 0) *fpos += written; return written; } /* Limit it for now to 3K (including tag) */ #define RAW_DATA_MAX_SIZE (1024*3) static ssize_t tracing_mark_raw_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { struct trace_array *tr = filp->private_data; struct ring_buffer_event *event; struct trace_buffer *buffer; struct raw_data_entry *entry; unsigned long irq_flags; ssize_t written; int size; int len; #define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int)) if (tracing_disabled) return -EINVAL; if (!(tr->trace_flags & TRACE_ITER_MARKERS)) return -EINVAL; /* The marker must at least have a tag id */ if (cnt < sizeof(unsigned int) || cnt > RAW_DATA_MAX_SIZE) return -EINVAL; if (cnt > TRACE_BUF_SIZE) cnt = TRACE_BUF_SIZE; BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); local_save_flags(irq_flags); size = sizeof(*entry) + cnt; if (cnt < FAULT_SIZE_ID) size += FAULT_SIZE_ID - cnt; buffer = tr->array_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size, irq_flags, preempt_count()); if (!event) /* Ring buffer disabled, return as if not open for write */ return -EBADF; entry = ring_buffer_event_data(event); len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); if (len) { entry->id = -1; memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); written = -EFAULT; } else written = cnt; __buffer_unlock_commit(buffer, event); if (written > 0) *fpos += written; return written; } static int tracing_clock_show(struct seq_file *m, void *v) { struct trace_array *tr = m->private; int i; for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) seq_printf(m, "%s%s%s%s", i ? " " : "", i == tr->clock_id ? "[" : "", trace_clocks[i].name, i == tr->clock_id ? "]" : ""); seq_putc(m, '\n'); return 0; } int tracing_set_clock(struct trace_array *tr, const char *clockstr) { int i; for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) { if (strcmp(trace_clocks[i].name, clockstr) == 0) break; } if (i == ARRAY_SIZE(trace_clocks)) return -EINVAL; mutex_lock(&trace_types_lock); tr->clock_id = i; ring_buffer_set_clock(tr->array_buffer.buffer, trace_clocks[i].func); /* * New clock may not be consistent with the previous clock. * Reset the buffer so that it doesn't have incomparable timestamps. */ tracing_reset_online_cpus(&tr->array_buffer); #ifdef CONFIG_TRACER_MAX_TRACE if (tr->max_buffer.buffer) ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); tracing_reset_online_cpus(&tr->max_buffer); #endif mutex_unlock(&trace_types_lock); return 0; } static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { struct seq_file *m = filp->private_data; struct trace_array *tr = m->private; char buf[64]; const char *clockstr; int ret; if (cnt >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; clockstr = strstrip(buf); ret = tracing_set_clock(tr, clockstr); if (ret) return ret; *fpos += cnt; return cnt; } static int tracing_clock_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; ret = single_open(file, tracing_clock_show, inode->i_private); if (ret < 0) trace_array_put(tr); return ret; } static int tracing_time_stamp_mode_show(struct seq_file *m, void *v) { struct trace_array *tr = m->private; mutex_lock(&trace_types_lock); if (ring_buffer_time_stamp_abs(tr->array_buffer.buffer)) seq_puts(m, "delta [absolute]\n"); else seq_puts(m, "[delta] absolute\n"); mutex_unlock(&trace_types_lock); return 0; } static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private); if (ret < 0) trace_array_put(tr); return ret; } int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs) { int ret = 0; mutex_lock(&trace_types_lock); if (abs && tr->time_stamp_abs_ref++) goto out; if (!abs) { if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) { ret = -EINVAL; goto out; } if (--tr->time_stamp_abs_ref) goto out; } ring_buffer_set_time_stamp_abs(tr->array_buffer.buffer, abs); #ifdef CONFIG_TRACER_MAX_TRACE if (tr->max_buffer.buffer) ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs); #endif out: mutex_unlock(&trace_types_lock); return ret; } struct ftrace_buffer_info { struct trace_iterator iter; void *spare; unsigned int spare_cpu; unsigned int read; }; #ifdef CONFIG_TRACER_SNAPSHOT static int tracing_snapshot_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; struct trace_iterator *iter; struct seq_file *m; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; if (file->f_mode & FMODE_READ) { iter = __tracing_open(inode, file, true); if (IS_ERR(iter)) ret = PTR_ERR(iter); } else { /* Writes still need the seq_file to hold the private data */ ret = -ENOMEM; m = kzalloc(sizeof(*m), GFP_KERNEL); if (!m) goto out; iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { kfree(m); goto out; } ret = 0; iter->tr = tr; iter->array_buffer = &tr->max_buffer; iter->cpu_file = tracing_get_cpu(inode); m->private = iter; file->private_data = m; } out: if (ret < 0) trace_array_put(tr); return ret; } static void tracing_swap_cpu_buffer(void *tr) { update_max_tr_single((struct trace_array *)tr, current, smp_processor_id()); } static ssize_t tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct seq_file *m = filp->private_data; struct trace_iterator *iter = m->private; struct trace_array *tr = iter->tr; unsigned long val; int ret; ret = tracing_update_buffers(); if (ret < 0) return ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; mutex_lock(&trace_types_lock); if (tr->current_trace->use_max_tr) { ret = -EBUSY; goto out; } local_irq_disable(); arch_spin_lock(&tr->max_lock); if (tr->cond_snapshot) ret = -EBUSY; arch_spin_unlock(&tr->max_lock); local_irq_enable(); if (ret) goto out; switch (val) { case 0: if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { ret = -EINVAL; break; } if (tr->allocated_snapshot) free_snapshot(tr); break; case 1: /* Only allow per-cpu swap if the ring buffer supports it */ #ifndef CONFIG_RING_BUFFER_ALLOW_SWAP if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { ret = -EINVAL; break; } #endif if (tr->allocated_snapshot) ret = resize_buffer_duplicate_size(&tr->max_buffer, &tr->array_buffer, iter->cpu_file); else ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) break; /* Now, we're going to swap */ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { local_irq_disable(); update_max_tr(tr, current, smp_processor_id(), NULL); local_irq_enable(); } else { smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer, (void *)tr, 1); } break; default: if (tr->allocated_snapshot) { if (iter->cpu_file == RING_BUFFER_ALL_CPUS) tracing_reset_online_cpus(&tr->max_buffer); else tracing_reset_cpu(&tr->max_buffer, iter->cpu_file); } break; } if (ret >= 0) { *ppos += cnt; ret = cnt; } out: mutex_unlock(&trace_types_lock); return ret; } static int tracing_snapshot_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; int ret; ret = tracing_release(inode, file); if (file->f_mode & FMODE_READ) return ret; /* If write only, the seq_file is just a stub */ if (m) kfree(m->private); kfree(m); return 0; } static int tracing_buffers_open(struct inode *inode, struct file *filp); static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos); static int tracing_buffers_release(struct inode *inode, struct file *file); static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); static int snapshot_raw_open(struct inode *inode, struct file *filp) { struct ftrace_buffer_info *info; int ret; /* The following checks for tracefs lockdown */ ret = tracing_buffers_open(inode, filp); if (ret < 0) return ret; info = filp->private_data; if (info->iter.trace->use_max_tr) { tracing_buffers_release(inode, filp); return -EBUSY; } info->iter.snapshot = true; info->iter.array_buffer = &info->iter.tr->max_buffer; return ret; } #endif /* CONFIG_TRACER_SNAPSHOT */ static const struct file_operations tracing_thresh_fops = { .open = tracing_open_generic, .read = tracing_thresh_read, .write = tracing_thresh_write, .llseek = generic_file_llseek, }; #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, .write = tracing_max_lat_write, .llseek = generic_file_llseek, }; #endif static const struct file_operations set_tracer_fops = { .open = tracing_open_generic_tr, .read = tracing_set_trace_read, .write = tracing_set_trace_write, .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; static const struct file_operations tracing_pipe_fops = { .open = tracing_open_pipe, .poll = tracing_poll_pipe, .read = tracing_read_pipe, .splice_read = tracing_splice_read_pipe, .release = tracing_release_pipe, .llseek = no_llseek, }; static const struct file_operations tracing_entries_fops = { .open = tracing_open_generic_tr, .read = tracing_entries_read, .write = tracing_entries_write, .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; static const struct file_operations tracing_total_entries_fops = { .open = tracing_open_generic_tr, .read = tracing_total_entries_read, .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; static const struct file_operations tracing_free_buffer_fops = { .open = tracing_open_generic_tr, .write = tracing_free_buffer_write, .release = tracing_free_buffer_release, }; static const struct file_operations tracing_mark_fops = { .open = tracing_open_generic_tr, .write = tracing_mark_write, .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; static const struct file_operations tracing_mark_raw_fops = { .open = tracing_open_generic_tr, .write = tracing_mark_raw_write, .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; static const struct file_operations trace_clock_fops = { .open = tracing_clock_open, .read = seq_read, .llseek = seq_lseek, .release = tracing_single_release_tr, .write = tracing_clock_write, }; static const struct file_operations trace_time_stamp_mode_fops = { .open = tracing_time_stamp_mode_open, .read = seq_read, .llseek = seq_lseek, .release = tracing_single_release_tr, }; #ifdef CONFIG_TRACER_SNAPSHOT static const struct file_operations snapshot_fops = { .open = tracing_snapshot_open, .read = seq_read, .write = tracing_snapshot_write, .llseek = tracing_lseek, .release = tracing_snapshot_release, }; static const struct file_operations snapshot_raw_fops = { .open = snapshot_raw_open, .read = tracing_buffers_read, .release = tracing_buffers_release, .splice_read = tracing_buffers_splice_read, .llseek = no_llseek, }; #endif /* CONFIG_TRACER_SNAPSHOT */ #define TRACING_LOG_ERRS_MAX 8 #define TRACING_LOG_LOC_MAX 128 #define CMD_PREFIX " Command: " struct err_info { const char **errs; /* ptr to loc-specific array of err strings */ u8 type; /* index into errs -> specific err string */ u8 pos; /* MAX_FILTER_STR_VAL = 256 */ u64 ts; }; struct tracing_log_err { struct list_head list; struct err_info info; char loc[TRACING_LOG_LOC_MAX]; /* err location */ char cmd[MAX_FILTER_STR_VAL]; /* what caused err */ }; static DEFINE_MUTEX(tracing_err_log_lock); static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr) { struct tracing_log_err *err; if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) { err = kzalloc(sizeof(*err), GFP_KERNEL); if (!err) err = ERR_PTR(-ENOMEM); else tr->n_err_log_entries++; return err; } err = list_first_entry(&tr->err_log, struct tracing_log_err, list); list_del(&err->list); return err; } /** * err_pos - find the position of a string within a command for error careting * @cmd: The tracing command that caused the error * @str: The string to position the caret at within @cmd * * Finds the position of the first occurence of @str within @cmd. The * return value can be passed to tracing_log_err() for caret placement * within @cmd. * * Returns the index within @cmd of the first occurence of @str or 0 * if @str was not found. */ unsigned int err_pos(char *cmd, const char *str) { char *found; if (WARN_ON(!strlen(cmd))) return 0; found = strstr(cmd, str); if (found) return found - cmd; return 0; } /** * tracing_log_err - write an error to the tracing error log * @tr: The associated trace array for the error (NULL for top level array) * @loc: A string describing where the error occurred * @cmd: The tracing command that caused the error * @errs: The array of loc-specific static error strings * @type: The index into errs[], which produces the specific static err string * @pos: The position the caret should be placed in the cmd * * Writes an error into tracing/error_log of the form: * * <loc>: error: <text> * Command: <cmd> * ^ * * tracing/error_log is a small log file containing the last * TRACING_LOG_ERRS_MAX errors (8). Memory for errors isn't allocated * unless there has been a tracing error, and the error log can be * cleared and have its memory freed by writing the empty string in * truncation mode to it i.e. echo > tracing/error_log. * * NOTE: the @errs array along with the @type param are used to * produce a static error string - this string is not copied and saved * when the error is logged - only a pointer to it is saved. See * existing callers for examples of how static strings are typically * defined for use with tracing_log_err(). */ void tracing_log_err(struct trace_array *tr, const char *loc, const char *cmd, const char **errs, u8 type, u8 pos) { struct tracing_log_err *err; if (!tr) tr = &global_trace; mutex_lock(&tracing_err_log_lock); err = get_tracing_log_err(tr); if (PTR_ERR(err) == -ENOMEM) { mutex_unlock(&tracing_err_log_lock); return; } snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc); snprintf(err->cmd, MAX_FILTER_STR_VAL,"\n" CMD_PREFIX "%s\n", cmd); err->info.errs = errs; err->info.type = type; err->info.pos = pos; err->info.ts = local_clock(); list_add_tail(&err->list, &tr->err_log); mutex_unlock(&tracing_err_log_lock); } static void clear_tracing_err_log(struct trace_array *tr) { struct tracing_log_err *err, *next; mutex_lock(&tracing_err_log_lock); list_for_each_entry_safe(err, next, &tr->err_log, list) { list_del(&err->list); kfree(err); } tr->n_err_log_entries = 0; mutex_unlock(&tracing_err_log_lock); } static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos) { struct trace_array *tr = m->private; mutex_lock(&tracing_err_log_lock); return seq_list_start(&tr->err_log, *pos); } static void *tracing_err_log_seq_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_array *tr = m->private; return seq_list_next(v, &tr->err_log, pos); } static void tracing_err_log_seq_stop(struct seq_file *m, void *v) { mutex_unlock(&tracing_err_log_lock); } static void tracing_err_log_show_pos(struct seq_file *m, u8 pos) { u8 i; for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++) seq_putc(m, ' '); for (i = 0; i < pos; i++) seq_putc(m, ' '); seq_puts(m, "^\n"); } static int tracing_err_log_seq_show(struct seq_file *m, void *v) { struct tracing_log_err *err = v; if (err) { const char *err_text = err->info.errs[err->info.type]; u64 sec = err->info.ts; u32 nsec; nsec = do_div(sec, NSEC_PER_SEC); seq_printf(m, "[%5llu.%06u] %s%s", sec, nsec / 1000, err->loc, err_text); seq_printf(m, "%s", err->cmd); tracing_err_log_show_pos(m, err->info.pos); } return 0; } static const struct seq_operations tracing_err_log_seq_ops = { .start = tracing_err_log_seq_start, .next = tracing_err_log_seq_next, .stop = tracing_err_log_seq_stop, .show = tracing_err_log_seq_show }; static int tracing_err_log_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; int ret = 0; ret = tracing_check_open_get_tr(tr); if (ret) return ret; /* If this file was opened for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) clear_tracing_err_log(tr); if (file->f_mode & FMODE_READ) { ret = seq_open(file, &tracing_err_log_seq_ops); if (!ret) { struct seq_file *m = file->private_data; m->private = tr; } else { trace_array_put(tr); } } return ret; } static ssize_t tracing_err_log_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { return count; } static int tracing_err_log_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; trace_array_put(tr); if (file->f_mode & FMODE_READ) seq_release(inode, file); return 0; } static const struct file_operations tracing_err_log_fops = { .open = tracing_err_log_open, .write = tracing_err_log_write, .read = seq_read, .llseek = tracing_lseek, .release = tracing_err_log_release, }; static int tracing_buffers_open(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; struct ftrace_buffer_info *info; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; info = kvzalloc(sizeof(*info), GFP_KERNEL); if (!info) { trace_array_put(tr); return -ENOMEM; } mutex_lock(&trace_types_lock); info->iter.tr = tr; info->iter.cpu_file = tracing_get_cpu(inode); info->iter.trace = tr->current_trace; info->iter.array_buffer = &tr->array_buffer; info->spare = NULL; /* Force reading ring buffer for first read */ info->read = (unsigned int)-1; filp->private_data = info; tr->trace_ref++; mutex_unlock(&trace_types_lock); ret = nonseekable_open(inode, filp); if (ret < 0) trace_array_put(tr); return ret; } static __poll_t tracing_buffers_poll(struct file *filp, poll_table *poll_table) { struct ftrace_buffer_info *info = filp->private_data; struct trace_iterator *iter = &info->iter; return trace_poll(iter, filp, poll_table); } static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct ftrace_buffer_info *info = filp->private_data; struct trace_iterator *iter = &info->iter; ssize_t ret = 0; ssize_t size; if (!count) return 0; #ifdef CONFIG_TRACER_MAX_TRACE if (iter->snapshot && iter->tr->current_trace->use_max_tr) return -EBUSY; #endif if (!info->spare) { info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer, iter->cpu_file); if (IS_ERR(info->spare)) { ret = PTR_ERR(info->spare); info->spare = NULL; } else { info->spare_cpu = iter->cpu_file; } } if (!info->spare) return ret; /* Do we have previous read data to read? */ if (info->read < PAGE_SIZE) goto read; again: trace_access_lock(iter->cpu_file); ret = ring_buffer_read_page(iter->array_buffer->buffer, &info->spare, count, iter->cpu_file, 0); trace_access_unlock(iter->cpu_file); if (ret < 0) { if (trace_empty(iter)) { if ((filp->f_flags & O_NONBLOCK)) return -EAGAIN; ret = wait_on_pipe(iter, 0); if (ret) return ret; goto again; } return 0; } info->read = 0; read: size = PAGE_SIZE - info->read; if (size > count) size = count; ret = copy_to_user(ubuf, info->spare + info->read, size); if (ret == size) return -EFAULT; size -= ret; *ppos += size; info->read += size; return size; } static int tracing_buffers_release(struct inode *inode, struct file *file) { struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; mutex_lock(&trace_types_lock); iter->tr->trace_ref--; __trace_array_put(iter->tr); if (info->spare) ring_buffer_free_read_page(iter->array_buffer->buffer, info->spare_cpu, info->spare); kvfree(info); mutex_unlock(&trace_types_lock); return 0; } struct buffer_ref { struct trace_buffer *buffer; void *page; int cpu; refcount_t refcount; }; static void buffer_ref_release(struct buffer_ref *ref) { if (!refcount_dec_and_test(&ref->refcount)) return; ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); kfree(ref); } static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct buffer_ref *ref = (struct buffer_ref *)buf->private; buffer_ref_release(ref); buf->private = 0; } static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct buffer_ref *ref = (struct buffer_ref *)buf->private; if (refcount_read(&ref->refcount) > INT_MAX/2) return false; refcount_inc(&ref->refcount); return true; } /* Pipe buffer operations for a buffer. */ static const struct pipe_buf_operations buffer_pipe_buf_ops = { .release = buffer_pipe_buf_release, .get = buffer_pipe_buf_get, }; /* * Callback from splice_to_pipe(), if we need to release some pages * at the end of the spd in case we error'ed out in filling the pipe. */ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) { struct buffer_ref *ref = (struct buffer_ref *)spd->partial[i].private; buffer_ref_release(ref); spd->partial[i].private = 0; } static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; struct partial_page partial_def[PIPE_DEF_BUFFERS]; struct page *pages_def[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { .pages = pages_def, .partial = partial_def, .nr_pages_max = PIPE_DEF_BUFFERS, .ops = &buffer_pipe_buf_ops, .spd_release = buffer_spd_release, }; struct buffer_ref *ref; int entries, i; ssize_t ret = 0; #ifdef CONFIG_TRACER_MAX_TRACE if (iter->snapshot && iter->tr->current_trace->use_max_tr) return -EBUSY; #endif if (*ppos & (PAGE_SIZE - 1)) return -EINVAL; if (len & (PAGE_SIZE - 1)) { if (len < PAGE_SIZE) return -EINVAL; len &= PAGE_MASK; } if (splice_grow_spd(pipe, &spd)) return -ENOMEM; again: trace_access_lock(iter->cpu_file); entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file); for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) { struct page *page; int r; ref = kzalloc(sizeof(*ref), GFP_KERNEL); if (!ref) { ret = -ENOMEM; break; } refcount_set(&ref->refcount, 1); ref->buffer = iter->array_buffer->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); if (IS_ERR(ref->page)) { ret = PTR_ERR(ref->page); ref->page = NULL; kfree(ref); break; } ref->cpu = iter->cpu_file; r = ring_buffer_read_page(ref->buffer, &ref->page, len, iter->cpu_file, 1); if (r < 0) { ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); kfree(ref); break; } page = virt_to_page(ref->page); spd.pages[i] = page; spd.partial[i].len = PAGE_SIZE; spd.partial[i].offset = 0; spd.partial[i].private = (unsigned long)ref; spd.nr_pages++; *ppos += PAGE_SIZE; entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file); } trace_access_unlock(iter->cpu_file); spd.nr_pages = i; /* did we read anything? */ if (!spd.nr_pages) { if (ret) goto out; ret = -EAGAIN; if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) goto out; ret = wait_on_pipe(iter, iter->snapshot ? 0 : iter->tr->buffer_percent); if (ret) goto out; goto again; } ret = splice_to_pipe(pipe, &spd); out: splice_shrink_spd(&spd); return ret; } static const struct file_operations tracing_buffers_fops = { .open = tracing_buffers_open, .read = tracing_buffers_read, .poll = tracing_buffers_poll, .release = tracing_buffers_release, .splice_read = tracing_buffers_splice_read, .llseek = no_llseek, }; static ssize_t tracing_stats_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct inode *inode = file_inode(filp); struct trace_array *tr = inode->i_private; struct array_buffer *trace_buf = &tr->array_buffer; int cpu = tracing_get_cpu(inode); struct trace_seq *s; unsigned long cnt; unsigned long long t; unsigned long usec_rem; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; trace_seq_init(s); cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "entries: %ld\n", cnt); cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "overrun: %ld\n", cnt); cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "commit overrun: %ld\n", cnt); cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "bytes: %ld\n", cnt); if (trace_clocks[tr->clock_id].in_ns) { /* local or global for trace_clock */ t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); usec_rem = do_div(t, USEC_PER_SEC); trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu)); usec_rem = do_div(t, USEC_PER_SEC); trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); } else { /* counter or tsc mode for trace_clock */ trace_seq_printf(s, "oldest event ts: %llu\n", ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); trace_seq_printf(s, "now ts: %llu\n", ring_buffer_time_stamp(trace_buf->buffer, cpu)); } cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "dropped events: %ld\n", cnt); cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "read events: %ld\n", cnt); count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, trace_seq_used(s)); kfree(s); return count; } static const struct file_operations tracing_stats_fops = { .open = tracing_open_generic_tr, .read = tracing_stats_read, .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; #ifdef CONFIG_DYNAMIC_FTRACE static ssize_t tracing_read_dyn_info(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { ssize_t ret; char *buf; int r; /* 256 should be plenty to hold the amount needed */ buf = kmalloc(256, GFP_KERNEL); if (!buf) return -ENOMEM; r = scnprintf(buf, 256, "%ld pages:%ld groups: %ld\n", ftrace_update_tot_cnt, ftrace_number_of_pages, ftrace_number_of_groups); ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); kfree(buf); return ret; } static const struct file_operations tracing_dyn_info_fops = { .open = tracing_open_generic, .read = tracing_read_dyn_info, .llseek = generic_file_llseek, }; #endif /* CONFIG_DYNAMIC_FTRACE */ #if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) static void ftrace_snapshot(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data) { tracing_snapshot_instance(tr); } static void ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_mapper *mapper = data; long *count = NULL; if (mapper) count = (long *)ftrace_func_mapper_find_ip(mapper, ip); if (count) { if (*count <= 0) return; (*count)--; } tracing_snapshot_instance(tr); } static int ftrace_snapshot_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_mapper *mapper = data; long *count = NULL; seq_printf(m, "%ps:", (void *)ip); seq_puts(m, "snapshot"); if (mapper) count = (long *)ftrace_func_mapper_find_ip(mapper, ip); if (count) seq_printf(m, ":count=%ld\n", *count); else seq_puts(m, ":unlimited\n"); return 0; } static int ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr, unsigned long ip, void *init_data, void **data) { struct ftrace_func_mapper *mapper = *data; if (!mapper) { mapper = allocate_ftrace_func_mapper(); if (!mapper) return -ENOMEM; *data = mapper; } return ftrace_func_mapper_add_ip(mapper, ip, init_data); } static void ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr, unsigned long ip, void *data) { struct ftrace_func_mapper *mapper = data; if (!ip) { if (!mapper) return; free_ftrace_func_mapper(mapper, NULL); return; } ftrace_func_mapper_remove_ip(mapper, ip); } static struct ftrace_probe_ops snapshot_probe_ops = { .func = ftrace_snapshot, .print = ftrace_snapshot_print, }; static struct ftrace_probe_ops snapshot_count_probe_ops = { .func = ftrace_count_snapshot, .print = ftrace_snapshot_print, .init = ftrace_snapshot_init, .free = ftrace_snapshot_free, }; static int ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; void *count = (void *)-1; char *number; int ret; if (!tr) return -ENODEV; /* hash funcs only work with set_ftrace_filter */ if (!enable) return -EINVAL; ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; if (glob[0] == '!') return unregister_ftrace_function_probe_func(glob+1, tr, ops); if (!param) goto out_reg; number = strsep(&param, ":"); if (!strlen(number)) goto out_reg; /* * We use the callback data field (which is a pointer) * as our counter. */ ret = kstrtoul(number, 0, (unsigned long *)&count); if (ret) return ret; out_reg: ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) goto out; ret = register_ftrace_function_probe(glob, tr, ops, count); out: return ret < 0 ? ret : 0; } static struct ftrace_func_command ftrace_snapshot_cmd = { .name = "snapshot", .func = ftrace_trace_snapshot_callback, }; static __init int register_snapshot_cmd(void) { return register_ftrace_command(&ftrace_snapshot_cmd); } #else static inline __init int register_snapshot_cmd(void) { return 0; } #endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ static struct dentry *tracing_get_dentry(struct trace_array *tr) { if (WARN_ON(!tr->dir)) return ERR_PTR(-ENODEV); /* Top directory uses NULL as the parent */ if (tr->flags & TRACE_ARRAY_FL_GLOBAL) return NULL; /* All sub buffers have a descriptor */ return tr->dir; } static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) { struct dentry *d_tracer; if (tr->percpu_dir) return tr->percpu_dir; d_tracer = tracing_get_dentry(tr); if (IS_ERR(d_tracer)) return NULL; tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer); MEM_FAIL(!tr->percpu_dir, "Could not create tracefs directory 'per_cpu/%d'\n", cpu); return tr->percpu_dir; } static struct dentry * trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, void *data, long cpu, const struct file_operations *fops) { struct dentry *ret = trace_create_file(name, mode, parent, data, fops); if (ret) /* See tracing_get_cpu() */ d_inode(ret)->i_cdev = (void *)(cpu + 1); return ret; } static void tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) { struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); struct dentry *d_cpu; char cpu_dir[30]; /* 30 characters should be more than enough */ if (!d_percpu) return; snprintf(cpu_dir, 30, "cpu%ld", cpu); d_cpu = tracefs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { pr_warn("Could not create tracefs '%s' entry\n", cpu_dir); return; } /* per cpu trace_pipe */ trace_create_cpu_file("trace_pipe", 0444, d_cpu, tr, cpu, &tracing_pipe_fops); /* per cpu trace */ trace_create_cpu_file("trace", 0644, d_cpu, tr, cpu, &tracing_fops); trace_create_cpu_file("trace_pipe_raw", 0444, d_cpu, tr, cpu, &tracing_buffers_fops); trace_create_cpu_file("stats", 0444, d_cpu, tr, cpu, &tracing_stats_fops); trace_create_cpu_file("buffer_size_kb", 0444, d_cpu, tr, cpu, &tracing_entries_fops); #ifdef CONFIG_TRACER_SNAPSHOT trace_create_cpu_file("snapshot", 0644, d_cpu, tr, cpu, &snapshot_fops); trace_create_cpu_file("snapshot_raw", 0444, d_cpu, tr, cpu, &snapshot_raw_fops); #endif } #ifdef CONFIG_FTRACE_SELFTEST /* Let selftest have access to static functions in this file */ #include "trace_selftest.c" #endif static ssize_t trace_options_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_option_dentry *topt = filp->private_data; char *buf; if (topt->flags->val & topt->opt->bit) buf = "1\n"; else buf = "0\n"; return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); } static ssize_t trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_option_dentry *topt = filp->private_data; unsigned long val; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; if (val != 0 && val != 1) return -EINVAL; if (!!(topt->flags->val & topt->opt->bit) != val) { mutex_lock(&trace_types_lock); ret = __set_tracer_option(topt->tr, topt->flags, topt->opt, !val); mutex_unlock(&trace_types_lock); if (ret) return ret; } *ppos += cnt; return cnt; } static int tracing_open_options(struct inode *inode, struct file *filp) { struct trace_option_dentry *topt = inode->i_private; int ret; ret = tracing_check_open_get_tr(topt->tr); if (ret) return ret; filp->private_data = inode->i_private; return 0; } static int tracing_release_options(struct inode *inode, struct file *file) { struct trace_option_dentry *topt = file->private_data; trace_array_put(topt->tr); return 0; } static const struct file_operations trace_options_fops = { .open = tracing_open_options, .read = trace_options_read, .write = trace_options_write, .llseek = generic_file_llseek, .release = tracing_release_options, }; /* * In order to pass in both the trace_array descriptor as well as the index * to the flag that the trace option file represents, the trace_array * has a character array of trace_flags_index[], which holds the index * of the bit for the flag it represents. index[0] == 0, index[1] == 1, etc. * The address of this character array is passed to the flag option file * read/write callbacks. * * In order to extract both the index and the trace_array descriptor, * get_tr_index() uses the following algorithm. * * idx = *ptr; * * As the pointer itself contains the address of the index (remember * index[1] == 1). * * Then to get the trace_array descriptor, by subtracting that index * from the ptr, we get to the start of the index itself. * * ptr - idx == &index[0] * * Then a simple container_of() from that pointer gets us to the * trace_array descriptor. */ static void get_tr_index(void *data, struct trace_array **ptr, unsigned int *pindex) { *pindex = *(unsigned char *)data; *ptr = container_of(data - *pindex, struct trace_array, trace_flags_index); } static ssize_t trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { void *tr_index = filp->private_data; struct trace_array *tr; unsigned int index; char *buf; get_tr_index(tr_index, &tr, &index); if (tr->trace_flags & (1 << index)) buf = "1\n"; else buf = "0\n"; return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); } static ssize_t trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { void *tr_index = filp->private_data; struct trace_array *tr; unsigned int index; unsigned long val; int ret; get_tr_index(tr_index, &tr, &index); ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; if (val != 0 && val != 1) return -EINVAL; mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); ret = set_tracer_flag(tr, 1 << index, val); mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); if (ret < 0) return ret; *ppos += cnt; return cnt; } static const struct file_operations trace_options_core_fops = { .open = tracing_open_generic, .read = trace_options_core_read, .write = trace_options_core_write, .llseek = generic_file_llseek, }; struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { struct dentry *ret; ret = tracefs_create_file(name, mode, parent, data, fops); if (!ret) pr_warn("Could not create tracefs '%s' entry\n", name); return ret; } static struct dentry *trace_options_init_dentry(struct trace_array *tr) { struct dentry *d_tracer; if (tr->options) return tr->options; d_tracer = tracing_get_dentry(tr); if (IS_ERR(d_tracer)) return NULL; tr->options = tracefs_create_dir("options", d_tracer); if (!tr->options) { pr_warn("Could not create tracefs directory 'options'\n"); return NULL; } return tr->options; } static void create_trace_option_file(struct trace_array *tr, struct trace_option_dentry *topt, struct tracer_flags *flags, struct tracer_opt *opt) { struct dentry *t_options; t_options = trace_options_init_dentry(tr); if (!t_options) return; topt->flags = flags; topt->opt = opt; topt->tr = tr; topt->entry = trace_create_file(opt->name, 0644, t_options, topt, &trace_options_fops); } static void create_trace_option_files(struct trace_array *tr, struct tracer *tracer) { struct trace_option_dentry *topts; struct trace_options *tr_topts; struct tracer_flags *flags; struct tracer_opt *opts; int cnt; int i; if (!tracer) return; flags = tracer->flags; if (!flags || !flags->opts) return; /* * If this is an instance, only create flags for tracers * the instance may have. */ if (!trace_ok_for_array(tracer, tr)) return; for (i = 0; i < tr->nr_topts; i++) { /* Make sure there's no duplicate flags. */ if (WARN_ON_ONCE(tr->topts[i].tracer->flags == tracer->flags)) return; } opts = flags->opts; for (cnt = 0; opts[cnt].name; cnt++) ; topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL); if (!topts) return; tr_topts = krealloc(tr->topts, sizeof(*tr->topts) * (tr->nr_topts + 1), GFP_KERNEL); if (!tr_topts) { kfree(topts); return; } tr->topts = tr_topts; tr->topts[tr->nr_topts].tracer = tracer; tr->topts[tr->nr_topts].topts = topts; tr->nr_topts++; for (cnt = 0; opts[cnt].name; cnt++) { create_trace_option_file(tr, &topts[cnt], flags, &opts[cnt]); MEM_FAIL(topts[cnt].entry == NULL, "Failed to create trace option: %s", opts[cnt].name); } } static struct dentry * create_trace_option_core_file(struct trace_array *tr, const char *option, long index) { struct dentry *t_options; t_options = trace_options_init_dentry(tr); if (!t_options) return NULL; return trace_create_file(option, 0644, t_options, (void *)&tr->trace_flags_index[index], &trace_options_core_fops); } static void create_trace_options_dir(struct trace_array *tr) { struct dentry *t_options; bool top_level = tr == &global_trace; int i; t_options = trace_options_init_dentry(tr); if (!t_options) return; for (i = 0; trace_options[i]; i++) { if (top_level || !((1 << i) & TOP_LEVEL_TRACE_FLAGS)) create_trace_option_core_file(tr, trace_options[i], i); } } static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; char buf[64]; int r; r = tracer_tracing_is_on(tr); r = sprintf(buf, "%d\n", r); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static ssize_t rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; struct trace_buffer *buffer = tr->array_buffer.buffer; unsigned long val; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; if (buffer) { mutex_lock(&trace_types_lock); if (!!val == tracer_tracing_is_on(tr)) { val = 0; /* do nothing */ } else if (val) { tracer_tracing_on(tr); if (tr->current_trace->start) tr->current_trace->start(tr); } else { tracer_tracing_off(tr); if (tr->current_trace->stop) tr->current_trace->stop(tr); } mutex_unlock(&trace_types_lock); } (*ppos)++; return cnt; } static const struct file_operations rb_simple_fops = { .open = tracing_open_generic_tr, .read = rb_simple_read, .write = rb_simple_write, .release = tracing_release_generic_tr, .llseek = default_llseek, }; static ssize_t buffer_percent_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; char buf[64]; int r; r = tr->buffer_percent; r = sprintf(buf, "%d\n", r); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static ssize_t buffer_percent_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; unsigned long val; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; if (val > 100) return -EINVAL; tr->buffer_percent = val; (*ppos)++; return cnt; } static const struct file_operations buffer_percent_fops = { .open = tracing_open_generic_tr, .read = buffer_percent_read, .write = buffer_percent_write, .release = tracing_release_generic_tr, .llseek = default_llseek, }; static struct dentry *trace_instance_dir; static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); static int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size) { enum ring_buffer_flags rb_flags; rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; buf->tr = tr; buf->buffer = ring_buffer_alloc(size, rb_flags); if (!buf->buffer) return -ENOMEM; buf->data = alloc_percpu(struct trace_array_cpu); if (!buf->data) { ring_buffer_free(buf->buffer); buf->buffer = NULL; return -ENOMEM; } /* Allocate the first page for all buffers */ set_buffer_entries(&tr->array_buffer, ring_buffer_size(tr->array_buffer.buffer, 0)); return 0; } static int allocate_trace_buffers(struct trace_array *tr, int size) { int ret; ret = allocate_trace_buffer(tr, &tr->array_buffer, size); if (ret) return ret; #ifdef CONFIG_TRACER_MAX_TRACE ret = allocate_trace_buffer(tr, &tr->max_buffer, allocate_snapshot ? size : 1); if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) { ring_buffer_free(tr->array_buffer.buffer); tr->array_buffer.buffer = NULL; free_percpu(tr->array_buffer.data); tr->array_buffer.data = NULL; return -ENOMEM; } tr->allocated_snapshot = allocate_snapshot; /* * Only the top level trace array gets its snapshot allocated * from the kernel command line. */ allocate_snapshot = false; #endif return 0; } static void free_trace_buffer(struct array_buffer *buf) { if (buf->buffer) { ring_buffer_free(buf->buffer); buf->buffer = NULL; free_percpu(buf->data); buf->data = NULL; } } static void free_trace_buffers(struct trace_array *tr) { if (!tr) return; free_trace_buffer(&tr->array_buffer); #ifdef CONFIG_TRACER_MAX_TRACE free_trace_buffer(&tr->max_buffer); #endif } static void init_trace_flags_index(struct trace_array *tr) { int i; /* Used by the trace options files */ for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) tr->trace_flags_index[i] = i; } static void __update_tracer_options(struct trace_array *tr) { struct tracer *t; for (t = trace_types; t; t = t->next) add_tracer_options(tr, t); } static void update_tracer_options(struct trace_array *tr) { mutex_lock(&trace_types_lock); tracer_options_updated = true; __update_tracer_options(tr); mutex_unlock(&trace_types_lock); } /* Must have trace_types_lock held */ struct trace_array *trace_array_find(const char *instance) { struct trace_array *tr, *found = NULL; list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (tr->name && strcmp(tr->name, instance) == 0) { found = tr; break; } } return found; } struct trace_array *trace_array_find_get(const char *instance) { struct trace_array *tr; mutex_lock(&trace_types_lock); tr = trace_array_find(instance); if (tr) tr->ref++; mutex_unlock(&trace_types_lock); return tr; } static int trace_array_create_dir(struct trace_array *tr) { int ret; tr->dir = tracefs_create_dir(tr->name, trace_instance_dir); if (!tr->dir) return -EINVAL; ret = event_trace_add_tracer(tr->dir, tr); if (ret) { tracefs_remove(tr->dir); return ret; } init_tracer_tracefs(tr, tr->dir); __update_tracer_options(tr); return ret; } static struct trace_array *trace_array_create(const char *name) { struct trace_array *tr; int ret; ret = -ENOMEM; tr = kzalloc(sizeof(*tr), GFP_KERNEL); if (!tr) return ERR_PTR(ret); tr->name = kstrdup(name, GFP_KERNEL); if (!tr->name) goto out_free_tr; if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) goto out_free_tr; if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL)) goto out_free_tr; tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; cpumask_copy(tr->tracing_cpumask, cpu_all_mask); raw_spin_lock_init(&tr->start_lock); tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; tr->current_trace = &nop_trace; INIT_LIST_HEAD(&tr->systems); INIT_LIST_HEAD(&tr->events); INIT_LIST_HEAD(&tr->hist_vars); INIT_LIST_HEAD(&tr->err_log); if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; if (ftrace_allocate_ftrace_ops(tr) < 0) goto out_free_tr; ftrace_init_trace_array(tr); init_trace_flags_index(tr); if (trace_instance_dir) { ret = trace_array_create_dir(tr); if (ret) goto out_free_tr; } else __trace_early_add_events(tr); list_add(&tr->list, &ftrace_trace_arrays); tr->ref++; return tr; out_free_tr: ftrace_free_ftrace_ops(tr); free_trace_buffers(tr); free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); return ERR_PTR(ret); } static int instance_mkdir(const char *name) { struct trace_array *tr; int ret; mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); ret = -EEXIST; if (trace_array_find(name)) goto out_unlock; tr = trace_array_create(name); ret = PTR_ERR_OR_ZERO(tr); out_unlock: mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); return ret; } /** * trace_array_get_by_name - Create/Lookup a trace array, given its name. * @name: The name of the trace array to be looked up/created. * * Returns pointer to trace array with given name. * NULL, if it cannot be created. * * NOTE: This function increments the reference counter associated with the * trace array returned. This makes sure it cannot be freed while in use. * Use trace_array_put() once the trace array is no longer needed. * If the trace_array is to be freed, trace_array_destroy() needs to * be called after the trace_array_put(), or simply let user space delete * it from the tracefs instances directory. But until the * trace_array_put() is called, user space can not delete it. * */ struct trace_array *trace_array_get_by_name(const char *name) { struct trace_array *tr; mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (tr->name && strcmp(tr->name, name) == 0) goto out_unlock; } tr = trace_array_create(name); if (IS_ERR(tr)) tr = NULL; out_unlock: if (tr) tr->ref++; mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); return tr; } EXPORT_SYMBOL_GPL(trace_array_get_by_name); static int __remove_instance(struct trace_array *tr) { int i; /* Reference counter for a newly created trace array = 1. */ if (tr->ref > 1 || (tr->current_trace && tr->trace_ref)) return -EBUSY; list_del(&tr->list); /* Disable all the flags that were enabled coming in */ for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) { if ((1 << i) & ZEROED_TRACE_FLAGS) set_tracer_flag(tr, 1 << i, 0); } tracing_set_nop(tr); clear_ftrace_function_probes(tr); event_trace_del_tracer(tr); ftrace_clear_pids(tr); ftrace_destroy_function_files(tr); tracefs_remove(tr->dir); free_trace_buffers(tr); clear_tracing_err_log(tr); for (i = 0; i < tr->nr_topts; i++) { kfree(tr->topts[i].topts); } kfree(tr->topts); free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); return 0; } int trace_array_destroy(struct trace_array *this_tr) { struct trace_array *tr; int ret; if (!this_tr) return -EINVAL; mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); ret = -ENODEV; /* Making sure trace array exists before destroying it. */ list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (tr == this_tr) { ret = __remove_instance(tr); break; } } mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); return ret; } EXPORT_SYMBOL_GPL(trace_array_destroy); static int instance_rmdir(const char *name) { struct trace_array *tr; int ret; mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); ret = -ENODEV; tr = trace_array_find(name); if (tr) ret = __remove_instance(tr); mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); return ret; } static __init void create_trace_instances(struct dentry *d_tracer) { struct trace_array *tr; trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer, instance_mkdir, instance_rmdir); if (MEM_FAIL(!trace_instance_dir, "Failed to create instances directory\n")) return; mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (!tr->name) continue; if (MEM_FAIL(trace_array_create_dir(tr) < 0, "Failed to create instance directory\n")) break; } mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); } static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) { struct trace_event_file *file; int cpu; trace_create_file("available_tracers", 0444, d_tracer, tr, &show_traces_fops); trace_create_file("current_tracer", 0644, d_tracer, tr, &set_tracer_fops); trace_create_file("tracing_cpumask", 0644, d_tracer, tr, &tracing_cpumask_fops); trace_create_file("trace_options", 0644, d_tracer, tr, &tracing_iter_fops); trace_create_file("trace", 0644, d_tracer, tr, &tracing_fops); trace_create_file("trace_pipe", 0444, d_tracer, tr, &tracing_pipe_fops); trace_create_file("buffer_size_kb", 0644, d_tracer, tr, &tracing_entries_fops); trace_create_file("buffer_total_size_kb", 0444, d_tracer, tr, &tracing_total_entries_fops); trace_create_file("free_buffer", 0200, d_tracer, tr, &tracing_free_buffer_fops); trace_create_file("trace_marker", 0220, d_tracer, tr, &tracing_mark_fops); file = __find_event_file(tr, "ftrace", "print"); if (file && file->dir) trace_create_file("trigger", 0644, file->dir, file, &event_trigger_fops); tr->trace_marker_file = file; trace_create_file("trace_marker_raw", 0220, d_tracer, tr, &tracing_mark_raw_fops); trace_create_file("trace_clock", 0644, d_tracer, tr, &trace_clock_fops); trace_create_file("tracing_on", 0644, d_tracer, tr, &rb_simple_fops); trace_create_file("timestamp_mode", 0444, d_tracer, tr, &trace_time_stamp_mode_fops); tr->buffer_percent = 50; trace_create_file("buffer_percent", 0444, d_tracer, tr, &buffer_percent_fops); create_trace_options_dir(tr); #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) trace_create_maxlat_file(tr, d_tracer); #endif if (ftrace_create_function_files(tr, d_tracer)) MEM_FAIL(1, "Could not allocate function filter files"); #ifdef CONFIG_TRACER_SNAPSHOT trace_create_file("snapshot", 0644, d_tracer, tr, &snapshot_fops); #endif trace_create_file("error_log", 0644, d_tracer, tr, &tracing_err_log_fops); for_each_tracing_cpu(cpu) tracing_init_tracefs_percpu(tr, cpu); ftrace_init_tracefs(tr, d_tracer); } static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) { struct vfsmount *mnt; struct file_system_type *type; /* * To maintain backward compatibility for tools that mount * debugfs to get to the tracing facility, tracefs is automatically * mounted to the debugfs/tracing directory. */ type = get_fs_type("tracefs"); if (!type) return NULL; mnt = vfs_submount(mntpt, type, "tracefs", NULL); put_filesystem(type); if (IS_ERR(mnt)) return NULL; mntget(mnt); return mnt; } /** * tracing_init_dentry - initialize top level trace array * * This is called when creating files or directories in the tracing * directory. It is called via fs_initcall() by any of the boot up code * and expects to return the dentry of the top level tracing directory. */ int tracing_init_dentry(void) { struct trace_array *tr = &global_trace; if (security_locked_down(LOCKDOWN_TRACEFS)) { pr_warn("Tracing disabled due to lockdown\n"); return -EPERM; } /* The top level trace array uses NULL as parent */ if (tr->dir) return 0; if (WARN_ON(!tracefs_initialized())) return -ENODEV; /* * As there may still be users that expect the tracing * files to exist in debugfs/tracing, we must automount * the tracefs file system there, so older tools still * work with the newer kerenl. */ tr->dir = debugfs_create_automount("tracing", NULL, trace_automount, NULL); return 0; } extern struct trace_eval_map *__start_ftrace_eval_maps[]; extern struct trace_eval_map *__stop_ftrace_eval_maps[]; static void __init trace_eval_init(void) { int len; len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps; trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len); } #ifdef CONFIG_MODULES static void trace_module_add_evals(struct module *mod) { if (!mod->num_trace_evals) return; /* * Modules with bad taint do not have events created, do * not bother with enums either. */ if (trace_module_has_bad_taint(mod)) return; trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals); } #ifdef CONFIG_TRACE_EVAL_MAP_FILE static void trace_module_remove_evals(struct module *mod) { union trace_eval_map_item *map; union trace_eval_map_item **last = &trace_eval_maps; if (!mod->num_trace_evals) return; mutex_lock(&trace_eval_mutex); map = trace_eval_maps; while (map) { if (map->head.mod == mod) break; map = trace_eval_jmp_to_tail(map); last = &map->tail.next; map = map->tail.next; } if (!map) goto out; *last = trace_eval_jmp_to_tail(map)->tail.next; kfree(map); out: mutex_unlock(&trace_eval_mutex); } #else static inline void trace_module_remove_evals(struct module *mod) { } #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ static int trace_module_notify(struct notifier_block *self, unsigned long val, void *data) { struct module *mod = data; switch (val) { case MODULE_STATE_COMING: trace_module_add_evals(mod); break; case MODULE_STATE_GOING: trace_module_remove_evals(mod); break; } return NOTIFY_OK; } static struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, .priority = 0, }; #endif /* CONFIG_MODULES */ static __init int tracer_init_tracefs(void) { int ret; trace_access_lock_init(); ret = tracing_init_dentry(); if (ret) return 0; event_trace_init(); init_tracer_tracefs(&global_trace, NULL); ftrace_init_tracefs_toplevel(&global_trace, NULL); trace_create_file("tracing_thresh", 0644, NULL, &global_trace, &tracing_thresh_fops); trace_create_file("README", 0444, NULL, NULL, &tracing_readme_fops); trace_create_file("saved_cmdlines", 0444, NULL, NULL, &tracing_saved_cmdlines_fops); trace_create_file("saved_cmdlines_size", 0644, NULL, NULL, &tracing_saved_cmdlines_size_fops); trace_create_file("saved_tgids", 0444, NULL, NULL, &tracing_saved_tgids_fops); trace_eval_init(); trace_create_eval_file(NULL); #ifdef CONFIG_MODULES register_module_notifier(&trace_module_nb); #endif #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, NULL, NULL, &tracing_dyn_info_fops); #endif create_trace_instances(NULL); update_tracer_options(&global_trace); return 0; } static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { if (ftrace_dump_on_oops) ftrace_dump(ftrace_dump_on_oops); return NOTIFY_OK; } static struct notifier_block trace_panic_notifier = { .notifier_call = trace_panic_handler, .next = NULL, .priority = 150 /* priority: INT_MAX >= x >= 0 */ }; static int trace_die_handler(struct notifier_block *self, unsigned long val, void *data) { switch (val) { case DIE_OOPS: if (ftrace_dump_on_oops) ftrace_dump(ftrace_dump_on_oops); break; default: break; } return NOTIFY_OK; } static struct notifier_block trace_die_notifier = { .notifier_call = trace_die_handler, .priority = 200 }; /* * printk is set to max of 1024, we really don't need it that big. * Nothing should be printing 1000 characters anyway. */ #define TRACE_MAX_PRINT 1000 /* * Define here KERN_TRACE so that we have one place to modify * it if we decide to change what log level the ftrace dump * should be at. */ #define KERN_TRACE KERN_EMERG void trace_printk_seq(struct trace_seq *s) { /* Probably should print a warning here. */ if (s->seq.len >= TRACE_MAX_PRINT) s->seq.len = TRACE_MAX_PRINT; /* * More paranoid code. Although the buffer size is set to * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just * an extra layer of protection. */ if (WARN_ON_ONCE(s->seq.len >= s->seq.size)) s->seq.len = s->seq.size - 1; /* should be zero ended, but we are paranoid. */ s->buffer[s->seq.len] = 0; printk(KERN_TRACE "%s", s->buffer); trace_seq_init(s); } void trace_init_global_iter(struct trace_iterator *iter) { iter->tr = &global_trace; iter->trace = iter->tr->current_trace; iter->cpu_file = RING_BUFFER_ALL_CPUS; iter->array_buffer = &global_trace.array_buffer; if (iter->trace && iter->trace->open) iter->trace->open(iter); /* Annotate start of buffers if we had overruns */ if (ring_buffer_overruns(iter->array_buffer->buffer)) iter->iter_flags |= TRACE_FILE_ANNOTATE; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ if (trace_clocks[iter->tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; /* Can not use kmalloc for iter.temp and iter.fmt */ iter->temp = static_temp_buf; iter->temp_size = STATIC_TEMP_BUF_SIZE; iter->fmt = static_fmt_buf; iter->fmt_size = STATIC_FMT_BUF_SIZE; } void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; static atomic_t dump_running; struct trace_array *tr = &global_trace; unsigned int old_userobj; unsigned long flags; int cnt = 0, cpu; /* Only allow one dump user at a time. */ if (atomic_inc_return(&dump_running) != 1) { atomic_dec(&dump_running); return; } /* * Always turn off tracing when we dump. * We don't need to show trace output of what happens * between multiple crashes. * * If the user does a sysrq-z, then they can re-enable * tracing with echo 1 > tracing_on. */ tracing_off(); local_irq_save(flags); printk_nmi_direct_enter(); /* Simulate the iterator */ trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); } old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ; /* don't look at user memory in panic mode */ tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; switch (oops_dump_mode) { case DUMP_ALL: iter.cpu_file = RING_BUFFER_ALL_CPUS; break; case DUMP_ORIG: iter.cpu_file = raw_smp_processor_id(); break; case DUMP_NONE: goto out_enable; default: printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); iter.cpu_file = RING_BUFFER_ALL_CPUS; } printk(KERN_TRACE "Dumping ftrace buffer:\n"); /* Did function tracer already get disabled? */ if (ftrace_is_dead()) { printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); printk("# MAY BE MISSING FUNCTION EVENTS\n"); } /* * We need to stop all tracing on all CPUS to read * the next buffer. This is a bit expensive, but is * not done often. We fill all what we can read, * and then release the locks again. */ while (!trace_empty(&iter)) { if (!cnt) printk(KERN_TRACE "---------------------------------\n"); cnt++; trace_iterator_reset(&iter); iter.iter_flags |= TRACE_FILE_LAT_FMT; if (trace_find_next_entry_inc(&iter) != NULL) { int ret; ret = print_trace_line(&iter); if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(&iter); } touch_nmi_watchdog(); trace_printk_seq(&iter.seq); } if (!cnt) printk(KERN_TRACE " (ftrace buffer empty)\n"); else printk(KERN_TRACE "---------------------------------\n"); out_enable: tr->trace_flags |= old_userobj; for_each_tracing_cpu(cpu) { atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); } atomic_dec(&dump_running); printk_nmi_direct_exit(); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(ftrace_dump); int trace_run_command(const char *buf, int (*createfn)(int, char **)) { char **argv; int argc, ret; argc = 0; ret = 0; argv = argv_split(GFP_KERNEL, buf, &argc); if (!argv) return -ENOMEM; if (argc) ret = createfn(argc, argv); argv_free(argv); return ret; } #define WRITE_BUFSIZE 4096 ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, size_t count, loff_t *ppos, int (*createfn)(int, char **)) { char *kbuf, *buf, *tmp; int ret = 0; size_t done = 0; size_t size; kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); if (!kbuf) return -ENOMEM; while (done < count) { size = count - done; if (size >= WRITE_BUFSIZE) size = WRITE_BUFSIZE - 1; if (copy_from_user(kbuf, buffer + done, size)) { ret = -EFAULT; goto out; } kbuf[size] = '\0'; buf = kbuf; do { tmp = strchr(buf, '\n'); if (tmp) { *tmp = '\0'; size = tmp - buf + 1; } else { size = strlen(buf); if (done + size < count) { if (buf != kbuf) break; /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */ pr_warn("Line length is too long: Should be less than %d\n", WRITE_BUFSIZE - 2); ret = -EINVAL; goto out; } } done += size; /* Remove comments */ tmp = strchr(buf, '#'); if (tmp) *tmp = '\0'; ret = trace_run_command(buf, createfn); if (ret) goto out; buf += size; } while (done < count); } ret = done; out: kfree(kbuf); return ret; } __init static int tracer_alloc_buffers(void) { int ring_buf_size; int ret = -ENOMEM; if (security_locked_down(LOCKDOWN_TRACEFS)) { pr_warn("Tracing disabled due to lockdown\n"); return -EPERM; } /* * Make sure we don't accidentally add more trace options * than we have bits for. */ BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE); if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) goto out; if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; /* Only allocate trace_printk buffers if a trace_printk exists */ if (&__stop___trace_bprintk_fmt != &__start___trace_bprintk_fmt) /* Must be called before global_trace.buffer is allocated */ trace_printk_init_buffers(); /* To save memory, keep the ring buffer size to its minimum */ if (ring_buffer_expanded) ring_buf_size = trace_buf_size; else ring_buf_size = 1; cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(global_trace.tracing_cpumask, cpu_all_mask); raw_spin_lock_init(&global_trace.start_lock); /* * The prepare callbacks allocates some memory for the ring buffer. We * don't free the buffer if the CPU goes down. If we were to free * the buffer, then the user would lose any trace that was in the * buffer. The memory will be removed once the "instance" is removed. */ ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE, "trace/RB:preapre", trace_rb_cpu_prepare, NULL); if (ret < 0) goto out_free_cpumask; /* Used for event triggers */ ret = -ENOMEM; temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE); if (!temp_buffer) goto out_rm_hp_state; if (trace_create_savedcmd() < 0) goto out_free_temp_buffer; if (!zalloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL)) goto out_free_savedcmd; /* TODO: make the number of buffers hot pluggable with CPUS */ if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n"); goto out_free_pipe_cpumask; } if (global_trace.buffer_disabled) tracing_off(); if (trace_boot_clock) { ret = tracing_set_clock(&global_trace, trace_boot_clock); if (ret < 0) pr_warn("Trace clock %s not defined, going back to default\n", trace_boot_clock); } /* * register_tracer() might reference current_trace, so it * needs to be set before we register anything. This is * just a bootstrap of current_trace anyway. */ global_trace.current_trace = &nop_trace; global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; ftrace_init_global_array_ops(&global_trace); init_trace_flags_index(&global_trace); register_tracer(&nop_trace); /* Function tracing may start here (via kernel command line) */ init_function_trace(); /* All seems OK, enable tracing */ tracing_disabled = 0; atomic_notifier_chain_register(&panic_notifier_list, &trace_panic_notifier); register_die_notifier(&trace_die_notifier); global_trace.flags = TRACE_ARRAY_FL_GLOBAL; INIT_LIST_HEAD(&global_trace.systems); INIT_LIST_HEAD(&global_trace.events); INIT_LIST_HEAD(&global_trace.hist_vars); INIT_LIST_HEAD(&global_trace.err_log); list_add(&global_trace.list, &ftrace_trace_arrays); apply_trace_boot_options(); register_snapshot_cmd(); return 0; out_free_pipe_cpumask: free_cpumask_var(global_trace.pipe_cpumask); out_free_savedcmd: free_saved_cmdlines_buffer(savedcmd); out_free_temp_buffer: ring_buffer_free(temp_buffer); out_rm_hp_state: cpuhp_remove_multi_state(CPUHP_TRACE_RB_PREPARE); out_free_cpumask: free_cpumask_var(global_trace.tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); out: return ret; } void __init early_trace_init(void) { if (tracepoint_printk) { tracepoint_print_iter = kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL); if (MEM_FAIL(!tracepoint_print_iter, "Failed to allocate trace iterator\n")) tracepoint_printk = 0; else static_key_enable(&tracepoint_printk_key.key); } tracer_alloc_buffers(); init_events(); } void __init trace_init(void) { trace_event_init(); } __init static int clear_boot_tracer(void) { /* * The default tracer at boot buffer is an init section. * This function is called in lateinit. If we did not * find the boot tracer, then clear it out, to prevent * later registration from accessing the buffer that is * about to be freed. */ if (!default_bootup_tracer) return 0; printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n", default_bootup_tracer); default_bootup_tracer = NULL; return 0; } fs_initcall(tracer_init_tracefs); late_initcall_sync(clear_boot_tracer); #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK __init static int tracing_set_default_clock(void) { /* sched_clock_stable() is determined in late_initcall */ if (!trace_boot_clock && !sched_clock_stable()) { if (security_locked_down(LOCKDOWN_TRACEFS)) { pr_warn("Can not set tracing clock due to lockdown\n"); return -EPERM; } printk(KERN_WARNING "Unstable clock detected, switching default tracing clock to \"global\"\n" "If you want to keep using the local clock, then add:\n" " \"trace_clock=local\"\n" "on the kernel command line\n"); tracing_set_clock(&global_trace, "global"); } return 0; } late_initcall_sync(tracing_set_default_clock); #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_H #define _LINUX_SCHED_H /* * Define 'struct task_struct' and provide the main scheduler * APIs (schedule(), wakeup variants, etc.) */ #include <uapi/linux/sched.h> #include <asm/current.h> #include <linux/pid.h> #include <linux/sem.h> #include <linux/shm.h> #include <linux/kcov.h> #include <linux/mutex.h> #include <linux/plist.h> #include <linux/hrtimer.h> #include <linux/irqflags.h> #include <linux/seccomp.h> #include <linux/nodemask.h> #include <linux/rcupdate.h> #include <linux/refcount.h> #include <linux/resource.h> #include <linux/latencytop.h> #include <linux/sched/prio.h> #include <linux/sched/types.h> #include <linux/signal_types.h> #include <linux/mm_types_task.h> #include <linux/task_io_accounting.h> #include <linux/posix-timers.h> #include <linux/rseq.h> #include <linux/seqlock.h> #include <linux/kcsan.h> /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; struct backing_dev_info; struct bio_list; struct blk_plug; struct capture_control; struct cfs_rq; struct fs_struct; struct futex_pi_state; struct io_context; struct mempolicy; struct nameidata; struct nsproxy; struct perf_event_context; struct pid_namespace; struct pipe_inode_info; struct rcu_node; struct reclaim_state; struct robust_list_head; struct root_domain; struct rq; struct sched_attr; struct sched_param; struct seq_file; struct sighand_struct; struct signal_struct; struct task_delay_info; struct task_group; struct io_uring_task; /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). * * We have two separate sets of flags: task->state * is about runnability, while task->exit_state are * about the task exiting. Confusing, but this way * modifying one set can't modify the other one by * mistake. */ /* Used in tsk->state: */ #define TASK_RUNNING 0x0000 #define TASK_INTERRUPTIBLE 0x0001 #define TASK_UNINTERRUPTIBLE 0x0002 #define __TASK_STOPPED 0x0004 #define __TASK_TRACED 0x0008 /* Used in tsk->exit_state: */ #define EXIT_DEAD 0x0010 #define EXIT_ZOMBIE 0x0020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->state again: */ #define TASK_PARKED 0x0040 #define TASK_DEAD 0x0080 #define TASK_WAKEKILL 0x0100 #define TASK_WAKING 0x0200 #define TASK_NOLOAD 0x0400 #define TASK_NEW 0x0800 #define TASK_STATE_MAX 0x1000 /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) #define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) #define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) /* Convenience macros for the sake of wake_up(): */ #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) /* get_task_state(): */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ TASK_PARKED) #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) #define task_is_stopped_or_traced(task) ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP /* * Special states are those that do not use the normal wait-loop pattern. See * the comment with set_special_state(). */ #define is_special_task_state(state) \ ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD)) #define __set_current_state(state_value) \ do { \ WARN_ON_ONCE(is_special_task_state(state_value));\ current->task_state_change = _THIS_IP_; \ current->state = (state_value); \ } while (0) #define set_current_state(state_value) \ do { \ WARN_ON_ONCE(is_special_task_state(state_value));\ current->task_state_change = _THIS_IP_; \ smp_store_mb(current->state, (state_value)); \ } while (0) #define set_special_state(state_value) \ do { \ unsigned long flags; /* may shadow */ \ WARN_ON_ONCE(!is_special_task_state(state_value)); \ raw_spin_lock_irqsave(&current->pi_lock, flags); \ current->task_state_change = _THIS_IP_; \ current->state = (state_value); \ raw_spin_unlock_irqrestore(&current->pi_lock, flags); \ } while (0) #else /* * set_current_state() includes a barrier so that the write of current->state * is correctly serialised wrt the caller's subsequent test of whether to * actually sleep: * * for (;;) { * set_current_state(TASK_UNINTERRUPTIBLE); * if (CONDITION) * break; * * schedule(); * } * __set_current_state(TASK_RUNNING); * * If the caller does not need such serialisation (because, for instance, the * CONDITION test and condition change and wakeup are under the same lock) then * use __set_current_state(). * * The above is typically ordered against the wakeup, which does: * * CONDITION = 1; * wake_up_state(p, TASK_UNINTERRUPTIBLE); * * where wake_up_state()/try_to_wake_up() executes a full memory barrier before * accessing p->state. * * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is, * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). * * However, with slightly different timing the wakeup TASK_RUNNING store can * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not * a problem either because that will result in one extra go around the loop * and our @cond test will save the day. * * Also see the comments of try_to_wake_up(). */ #define __set_current_state(state_value) \ current->state = (state_value) #define set_current_state(state_value) \ smp_store_mb(current->state, (state_value)) /* * set_special_state() should be used for those states when the blocking task * can not use the regular condition based wait-loop. In that case we must * serialize against wakeups such that any possible in-flight TASK_RUNNING stores * will not collide with our state change. */ #define set_special_state(state_value) \ do { \ unsigned long flags; /* may shadow */ \ raw_spin_lock_irqsave(&current->pi_lock, flags); \ current->state = (state_value); \ raw_spin_unlock_irqrestore(&current->pi_lock, flags); \ } while (0) #endif /* Task command name length: */ #define TASK_COMM_LEN 16 extern void scheduler_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern long schedule_timeout(long timeout); extern long schedule_timeout_interruptible(long timeout); extern long schedule_timeout_killable(long timeout); extern long schedule_timeout_uninterruptible(long timeout); extern long schedule_timeout_idle(long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); asmlinkage void preempt_schedule_irq(void); extern int __must_check io_schedule_prepare(void); extern void io_schedule_finish(int token); extern long io_schedule_timeout(long timeout); extern void io_schedule(void); /** * struct prev_cputime - snapshot of system and user cputime * @utime: time spent in user mode * @stime: time spent in system mode * @lock: protects the above two fields * * Stores previous user/system time values such that we can guarantee * monotonicity. */ struct prev_cputime { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE u64 utime; u64 stime; raw_spinlock_t lock; #endif }; enum vtime_state { /* Task is sleeping or running in a CPU with VTIME inactive: */ VTIME_INACTIVE = 0, /* Task is idle */ VTIME_IDLE, /* Task runs in kernelspace in a CPU with VTIME active: */ VTIME_SYS, /* Task runs in userspace in a CPU with VTIME active: */ VTIME_USER, /* Task runs as guests in a CPU with VTIME active: */ VTIME_GUEST, }; struct vtime { seqcount_t seqcount; unsigned long long starttime; enum vtime_state state; unsigned int cpu; u64 utime; u64 stime; u64 gtime; }; /* * Utilization clamp constraints. * @UCLAMP_MIN: Minimum utilization * @UCLAMP_MAX: Maximum utilization * @UCLAMP_CNT: Utilization clamp constraints count */ enum uclamp_id { UCLAMP_MIN = 0, UCLAMP_MAX, UCLAMP_CNT }; #ifdef CONFIG_SMP extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; #endif struct sched_info { #ifdef CONFIG_SCHED_INFO /* Cumulative counters: */ /* # of times we have run on this CPU: */ unsigned long pcount; /* Time spent waiting on a runqueue: */ unsigned long long run_delay; /* Timestamps: */ /* When did we last run on a CPU? */ unsigned long long last_arrival; /* When were we last queued to run? */ unsigned long long last_queued; #endif /* CONFIG_SCHED_INFO */ }; /* * Integer metrics need fixed point arithmetic, e.g., sched/fair * has a few: load, load_avg, util_avg, freq, and capacity. * * We define a basic fixed point arithmetic range, and then formalize * all these metrics based on that basic range. */ # define SCHED_FIXEDPOINT_SHIFT 10 # define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) /* Increase resolution of cpu_capacity calculations */ # define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT # define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) struct load_weight { unsigned long weight; u32 inv_weight; }; /** * struct util_est - Estimation utilization of FAIR tasks * @enqueued: instantaneous estimated utilization of a task/cpu * @ewma: the Exponential Weighted Moving Average (EWMA) * utilization of a task * * Support data structure to track an Exponential Weighted Moving Average * (EWMA) of a FAIR task's utilization. New samples are added to the moving * average each time a task completes an activation. Sample's weight is chosen * so that the EWMA will be relatively insensitive to transient changes to the * task's workload. * * The enqueued attribute has a slightly different meaning for tasks and cpus: * - task: the task's util_avg at last task dequeue time * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU * Thus, the util_est.enqueued of a task represents the contribution on the * estimated utilization of the CPU where that task is currently enqueued. * * Only for tasks we track a moving average of the past instantaneous * estimated utilization. This allows to absorb sporadic drops in utilization * of an otherwise almost periodic task. * * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg * updates. When a task is dequeued, its util_est should not be updated if its * util_avg has not been updated in the meantime. * This information is mapped into the MSB bit of util_est.enqueued at dequeue * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg * for a task) it is safe to use MSB. */ struct util_est { unsigned int enqueued; unsigned int ewma; #define UTIL_EST_WEIGHT_SHIFT 2 #define UTIL_AVG_UNCHANGED 0x80000000 } __attribute__((__aligned__(sizeof(u64)))); /* * The load/runnable/util_avg accumulates an infinite geometric series * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c). * * [load_avg definition] * * load_avg = runnable% * scale_load_down(load) * * [runnable_avg definition] * * runnable_avg = runnable% * SCHED_CAPACITY_SCALE * * [util_avg definition] * * util_avg = running% * SCHED_CAPACITY_SCALE * * where runnable% is the time ratio that a sched_entity is runnable and * running% the time ratio that a sched_entity is running. * * For cfs_rq, they are the aggregated values of all runnable and blocked * sched_entities. * * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU * capacity scaling. The scaling is done through the rq_clock_pelt that is used * for computing those signals (see update_rq_clock_pelt()) * * N.B., the above ratios (runnable% and running%) themselves are in the * range of [0, 1]. To do fixed point arithmetics, we therefore scale them * to as large a range as necessary. This is for example reflected by * util_avg's SCHED_CAPACITY_SCALE. * * [Overflow issue] * * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities * with the highest load (=88761), always runnable on a single cfs_rq, * and should not overflow as the number already hits PID_MAX_LIMIT. * * For all other cases (including 32-bit kernels), struct load_weight's * weight will overflow first before we do, because: * * Max(load_avg) <= Max(load.weight) * * Then it is the load_weight's responsibility to consider overflow * issues. */ struct sched_avg { u64 last_update_time; u64 load_sum; u64 runnable_sum; u32 util_sum; u32 period_contrib; unsigned long load_avg; unsigned long runnable_avg; unsigned long util_avg; struct util_est util_est; } ____cacheline_aligned; struct sched_statistics { #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; u64 wait_count; u64 wait_sum; u64 iowait_count; u64 iowait_sum; u64 sleep_start; u64 sleep_max; s64 sum_sleep_runtime; u64 block_start; u64 block_max; u64 exec_max; u64 slice_max; u64 nr_migrations_cold; u64 nr_failed_migrations_affine; u64 nr_failed_migrations_running; u64 nr_failed_migrations_hot; u64 nr_forced_migrations; u64 nr_wakeups; u64 nr_wakeups_sync; u64 nr_wakeups_migrate; u64 nr_wakeups_local; u64 nr_wakeups_remote; u64 nr_wakeups_affine; u64 nr_wakeups_affine_attempts; u64 nr_wakeups_passive; u64 nr_wakeups_idle; #endif }; struct sched_entity { /* For load-balancing: */ struct load_weight load; struct rb_node run_node; struct list_head group_node; unsigned int on_rq; u64 exec_start; u64 sum_exec_runtime; u64 vruntime; u64 prev_sum_exec_runtime; u64 nr_migrations; struct sched_statistics statistics; #ifdef CONFIG_FAIR_GROUP_SCHED int depth; struct sched_entity *parent; /* rq on which this entity is (to be) queued: */ struct cfs_rq *cfs_rq; /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; /* cached value of my_q->h_nr_running */ unsigned long runnable_weight; #endif #ifdef CONFIG_SMP /* * Per entity load average tracking. * * Put into separate cache line so it does not * collide with read-mostly values above. */ struct sched_avg avg; #endif }; struct sched_rt_entity { struct list_head run_list; unsigned long timeout; unsigned long watchdog_stamp; unsigned int time_slice; unsigned short on_rq; unsigned short on_list; struct sched_rt_entity *back; #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity *parent; /* rq on which this entity is (to be) queued: */ struct rt_rq *rt_rq; /* rq "owned" by this entity/group: */ struct rt_rq *my_q; #endif } __randomize_layout; struct sched_dl_entity { struct rb_node rb_node; /* * Original scheduling parameters. Copied here from sched_attr * during sched_setattr(), they will remain the same until * the next sched_setattr(). */ u64 dl_runtime; /* Maximum runtime for each instance */ u64 dl_deadline; /* Relative deadline of each instance */ u64 dl_period; /* Separation of two instances (period) */ u64 dl_bw; /* dl_runtime / dl_period */ u64 dl_density; /* dl_runtime / dl_deadline */ /* * Actual scheduling parameters. Initialized with the values above, * they are continuously updated during task execution. Note that * the remaining runtime could be < 0 in case we are in overrun. */ s64 runtime; /* Remaining runtime for this instance */ u64 deadline; /* Absolute deadline for this instance */ unsigned int flags; /* Specifying the scheduler behaviour */ /* * Some bool flags: * * @dl_throttled tells if we exhausted the runtime. If so, the * task has to wait for a replenishment to be performed at the * next firing of dl_timer. * * @dl_yielded tells if task gave up the CPU before consuming * all its available runtime during the last job. * * @dl_non_contending tells if the task is inactive while still * contributing to the active utilization. In other words, it * indicates if the inactive timer has been armed and its handler * has not been executed yet. This flag is useful to avoid race * conditions between the inactive timer handler and the wakeup * code. * * @dl_overrun tells if the task asked to be informed about runtime * overruns. */ unsigned int dl_throttled : 1; unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; /* * Bandwidth enforcement timer. Each -deadline task has its * own bandwidth to be enforced, thus we need one timer per task. */ struct hrtimer dl_timer; /* * Inactive timer, responsible for decreasing the active utilization * at the "0-lag time". When a -deadline task blocks, it contributes * to GRUB's active utilization until the "0-lag time", hence a * timer is needed to decrease the active utilization at the correct * time. */ struct hrtimer inactive_timer; #ifdef CONFIG_RT_MUTEXES /* * Priority Inheritance. When a DEADLINE scheduling entity is boosted * pi_se points to the donor, otherwise points to the dl_se it belongs * to (the original one/itself). */ struct sched_dl_entity *pi_se; #endif }; #ifdef CONFIG_UCLAMP_TASK /* Number of utilization clamp buckets (shorter alias) */ #define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT /* * Utilization clamp for a scheduling entity * @value: clamp value "assigned" to a se * @bucket_id: bucket index corresponding to the "assigned" value * @active: the se is currently refcounted in a rq's bucket * @user_defined: the requested clamp value comes from user-space * * The bucket_id is the index of the clamp bucket matching the clamp value * which is pre-computed and stored to avoid expensive integer divisions from * the fast path. * * The active bit is set whenever a task has got an "effective" value assigned, * which can be different from the clamp value "requested" from user-space. * This allows to know a task is refcounted in the rq's bucket corresponding * to the "effective" bucket_id. * * The user_defined bit is set whenever a task has got a task-specific clamp * value requested from userspace, i.e. the system defaults apply to this task * just as a restriction. This allows to relax default clamps when a less * restrictive task-specific value has been requested, thus allowing to * implement a "nice" semantic. For example, a task running with a 20% * default boost can still drop its own boosting to 0%. */ struct uclamp_se { unsigned int value : bits_per(SCHED_CAPACITY_SCALE); unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); unsigned int active : 1; unsigned int user_defined : 1; }; #endif /* CONFIG_UCLAMP_TASK */ union rcu_special { struct { u8 blocked; u8 need_qs; u8 exp_hint; /* Hint for performance. */ u8 need_mb; /* Readers need smp_mb(). */ } b; /* Bits. */ u32 s; /* Set of bits. */ }; enum perf_event_task_context { perf_invalid_context = -1, perf_hw_context = 0, perf_sw_context, perf_nr_task_contexts, }; struct wake_q_node { struct wake_q_node *next; }; struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* * For reasons of header soup (see current_thread_info()), this * must be the first element of task_struct. */ struct thread_info thread_info; #endif /* -1 unrunnable, 0 runnable, >0 stopped: */ volatile long state; /* * This begins the randomizable portion of task_struct. Only * scheduling-critical items should be added above here. */ randomized_struct_fields_start void *stack; refcount_t usage; /* Per task flags (PF_*), defined further below: */ unsigned int flags; unsigned int ptrace; #ifdef CONFIG_SMP int on_cpu; struct __call_single_node wake_entry; #ifdef CONFIG_THREAD_INFO_IN_TASK /* Current CPU: */ unsigned int cpu; #endif unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; struct task_struct *last_wakee; /* * recent_used_cpu is initially set as the last CPU used by a task * that wakes affine another task. Waker/wakee relationships can * push tasks around a CPU where each wakeup moves to the next one. * Tracking a recently used CPU allows a quick search for a recently * used CPU that may be idle. */ int recent_used_cpu; int wake_cpu; #endif int on_rq; int prio; int static_prio; int normal_prio; unsigned int rt_priority; const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif struct sched_dl_entity dl; #ifdef CONFIG_UCLAMP_TASK /* * Clamp values requested for a scheduling entity. * Must be updated with task_rq_lock() held. */ struct uclamp_se uclamp_req[UCLAMP_CNT]; /* * Effective clamp values used for a scheduling entity. * Must be updated with task_rq_lock() held. */ struct uclamp_se uclamp[UCLAMP_CNT]; #endif #ifdef CONFIG_PREEMPT_NOTIFIERS /* List of struct preempt_notifier: */ struct hlist_head preempt_notifiers; #endif #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif unsigned int policy; int nr_cpus_allowed; const cpumask_t *cpus_ptr; cpumask_t cpus_mask; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; union rcu_special rcu_read_unlock_special; struct list_head rcu_node_entry; struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TASKS_RCU unsigned long rcu_tasks_nvcsw; u8 rcu_tasks_holdout; u8 rcu_tasks_idx; int rcu_tasks_idle_cpu; struct list_head rcu_tasks_holdout_list; #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU int trc_reader_nesting; int trc_ipi_to_cpu; union rcu_special trc_reader_special; bool trc_reader_checked; struct list_head trc_holdout_list; #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ struct sched_info sched_info; struct list_head tasks; #ifdef CONFIG_SMP struct plist_node pushable_tasks; struct rb_node pushable_dl_tasks; #endif struct mm_struct *mm; struct mm_struct *active_mm; /* Per-thread vma caching: */ struct vmacache vmacache; #ifdef SPLIT_RSS_COUNTING struct task_rss_stat rss_stat; #endif int exit_state; int exit_code; int exit_signal; /* The signal sent when the parent dies: */ int pdeath_signal; /* JOBCTL_*, siglock protected: */ unsigned long jobctl; /* Used for emulating ABI behavior of previous Linux versions: */ unsigned int personality; /* Scheduler bits, serialized by scheduler locks: */ unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; #ifdef CONFIG_PSI unsigned sched_psi_wake_requeue:1; #endif /* Force alignment to the next boundary: */ unsigned :0; /* Unserialized, strictly 'current' */ /* * This field must not be in the scheduler word above due to wakelist * queueing no longer being serialized by p->on_cpu. However: * * p->XXX = X; ttwu() * schedule() if (p->on_rq && ..) // false * smp_mb__after_spinlock(); if (smp_load_acquire(&p->on_cpu) && //true * deactivate_task() ttwu_queue_wakelist()) * p->on_rq = 0; p->sched_remote_wakeup = Y; * * guarantees all stores of 'current' are visible before * ->sched_remote_wakeup gets used, so it can be in this word. */ unsigned sched_remote_wakeup:1; /* Bit to tell LSMs we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; #ifndef TIF_RESTORE_SIGMASK unsigned restore_sigmask:1; #endif #ifdef CONFIG_MEMCG unsigned in_user_fault:1; #endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif #ifdef CONFIG_CGROUPS /* disallow userland-initiated cgroup migration */ unsigned no_cgroup_migration:1; /* task is frozen/stopped (used by the cgroup freezer) */ unsigned frozen:1; #endif #ifdef CONFIG_BLK_CGROUP unsigned use_memdelay:1; #endif #ifdef CONFIG_PSI /* Stalled due to lack of memory */ unsigned in_memstall:1; #endif unsigned long atomic_flags; /* Flags requiring atomic access. */ struct restart_block restart_block; pid_t pid; pid_t tgid; #ifdef CONFIG_STACKPROTECTOR /* Canary value for the -fstack-protector GCC feature: */ unsigned long stack_canary; #endif /* * Pointers to the (original) parent process, youngest child, younger sibling, * older sibling, respectively. (p->father can be replaced with * p->real_parent->pid) */ /* Real parent process: */ struct task_struct __rcu *real_parent; /* Recipient of SIGCHLD, wait4() reports: */ struct task_struct __rcu *parent; /* * Children/sibling form the list of natural children: */ struct list_head children; struct list_head sibling; struct task_struct *group_leader; /* * 'ptraced' is the list of tasks this task is using ptrace() on. * * This includes both natural children and PTRACE_ATTACH targets. * 'ptrace_entry' is this task's link on the p->parent->ptraced list. */ struct list_head ptraced; struct list_head ptrace_entry; /* PID/PID hash table linkage. */ struct pid *thread_pid; struct hlist_node pid_links[PIDTYPE_MAX]; struct list_head thread_group; struct list_head thread_node; struct completion *vfork_done; /* CLONE_CHILD_SETTID: */ int __user *set_child_tid; /* CLONE_CHILD_CLEARTID: */ int __user *clear_child_tid; /* PF_IO_WORKER */ void *pf_io_worker; u64 utime; u64 stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME u64 utimescaled; u64 stimescaled; #endif u64 gtime; struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN struct vtime vtime; #endif #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif /* Context switch counts: */ unsigned long nvcsw; unsigned long nivcsw; /* Monotonic time in nsecs: */ u64 start_time; /* Boot based time in nsecs: */ u64 start_boottime; /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */ unsigned long min_flt; unsigned long maj_flt; /* Empty if CONFIG_POSIX_CPUTIMERS=n */ struct posix_cputimers posix_cputimers; #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK struct posix_cputimers_work posix_cputimers_work; #endif /* Process credentials: */ /* Tracer's credentials at attach: */ const struct cred __rcu *ptracer_cred; /* Objective and real subjective task credentials (COW): */ const struct cred __rcu *real_cred; /* Effective (overridable) subjective task credentials (COW): */ const struct cred __rcu *cred; #ifdef CONFIG_KEYS /* Cached requested key. */ struct key *cached_requested_key; #endif /* * executable name, excluding path. * * - normally initialized setup_new_exec() * - access it with [gs]et_task_comm() * - lock it with task_lock() */ char comm[TASK_COMM_LEN]; struct nameidata *nameidata; #ifdef CONFIG_SYSVIPC struct sysv_sem sysvsem; struct sysv_shm sysvshm; #endif #ifdef CONFIG_DETECT_HUNG_TASK unsigned long last_switch_count; unsigned long last_switch_time; #endif /* Filesystem information: */ struct fs_struct *fs; /* Open file information: */ struct files_struct *files; #ifdef CONFIG_IO_URING struct io_uring_task *io_uring; #endif /* Namespaces: */ struct nsproxy *nsproxy; /* Signal handlers: */ struct signal_struct *signal; struct sighand_struct __rcu *sighand; sigset_t blocked; sigset_t real_blocked; /* Restored if set_restore_sigmask() was used: */ sigset_t saved_sigmask; struct sigpending pending; unsigned long sas_ss_sp; size_t sas_ss_size; unsigned int sas_ss_flags; struct callback_head *task_works; #ifdef CONFIG_AUDIT #ifdef CONFIG_AUDITSYSCALL struct audit_context *audit_context; #endif kuid_t loginuid; unsigned int sessionid; #endif struct seccomp seccomp; /* Thread group tracking: */ u64 parent_exec_id; u64 self_exec_id; /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */ spinlock_t alloc_lock; /* Protection of the PI data structures: */ raw_spinlock_t pi_lock; struct wake_q_node wake_q; #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task: */ struct rb_root_cached pi_waiters; /* Updated under owner's pi_lock and rq lock */ struct task_struct *pi_top_task; /* Deadlock detection and priority inheritance handling: */ struct rt_mutex_waiter *pi_blocked_on; #endif #ifdef CONFIG_DEBUG_MUTEXES /* Mutex deadlock detection: */ struct mutex_waiter *blocked_on; #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP int non_block_count; #endif #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events irqtrace; unsigned int hardirq_threaded; u64 hardirq_chain_key; int softirqs_enabled; int softirq_context; int irq_config; #endif #ifdef CONFIG_LOCKDEP # define MAX_LOCK_DEPTH 48UL u64 curr_chain_key; int lockdep_depth; unsigned int lockdep_recursion; struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif #if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP) unsigned int in_ubsan; #endif /* Journalling filesystem info: */ void *journal_info; /* Stacked block device info: */ struct bio_list *bio_list; #ifdef CONFIG_BLOCK /* Stack plugging: */ struct blk_plug *plug; #endif /* VM state: */ struct reclaim_state *reclaim_state; struct backing_dev_info *backing_dev_info; struct io_context *io_context; #ifdef CONFIG_COMPACTION struct capture_control *capture_control; #endif /* Ptrace state: */ unsigned long ptrace_message; kernel_siginfo_t *last_siginfo; struct task_io_accounting ioac; #ifdef CONFIG_PSI /* Pressure stall state */ unsigned int psi_flags; #endif #ifdef CONFIG_TASK_XACCT /* Accumulated RSS usage: */ u64 acct_rss_mem1; /* Accumulated virtual memory usage: */ u64 acct_vm_mem1; /* stime + utime since last update: */ u64 acct_timexpd; #endif #ifdef CONFIG_CPUSETS /* Protected by ->alloc_lock: */ nodemask_t mems_allowed; /* Seqence number to catch updates: */ seqcount_spinlock_t mems_allowed_seq; int cpuset_mem_spread_rotor; int cpuset_slab_spread_rotor; #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock: */ struct css_set __rcu *cgroups; /* cg_list protected by css_set_lock and tsk->alloc_lock: */ struct list_head cg_list; #endif #ifdef CONFIG_X86_CPU_RESCTRL u32 closid; u32 rmid; #endif #ifdef CONFIG_FUTEX struct robust_list_head __user *robust_list; #ifdef CONFIG_COMPAT struct compat_robust_list_head __user *compat_robust_list; #endif struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; struct mutex futex_exit_mutex; unsigned int futex_state; #endif #ifdef CONFIG_PERF_EVENTS struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts]; struct mutex perf_event_mutex; struct list_head perf_event_list; #endif #ifdef CONFIG_DEBUG_PREEMPT unsigned long preempt_disable_ip; #endif #ifdef CONFIG_NUMA /* Protected by alloc_lock: */ struct mempolicy *mempolicy; short il_prev; short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING int numa_scan_seq; unsigned int numa_scan_period; unsigned int numa_scan_period_max; int numa_preferred_nid; unsigned long numa_migrate_retry; /* Migration stamp: */ u64 node_stamp; u64 last_task_numa_placement; u64 last_sum_exec_runtime; struct callback_head numa_work; /* * This pointer is only modified for current in syscall and * pagefault context (and for tasks being destroyed), so it can be read * from any of the following contexts: * - RCU read-side critical section * - current->numa_group from everywhere * - task's runqueue locked, task not running */ struct numa_group __rcu *numa_group; /* * numa_faults is an array split into four regions: * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer * in this precise order. * * faults_memory: Exponential decaying average of faults on a per-node * basis. Scheduling placement decisions are made based on these * counts. The values remain static for the duration of a PTE scan. * faults_cpu: Track the nodes the process was running on when a NUMA * hinting fault was incurred. * faults_memory_buffer and faults_cpu_buffer: Record faults per node * during the current scan window. When the scan completes, the counts * in faults_memory and faults_cpu decay and these values are copied. */ unsigned long *numa_faults; unsigned long total_numa_faults; /* * numa_faults_locality tracks if faults recorded during the last * scan window were remote/local or failed to migrate. The task scan * period is adapted based on the locality of the faults with different * weights depending on whether they were shared or private faults */ unsigned long numa_faults_locality[3]; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_RSEQ struct rseq __user *rseq; u32 rseq_sig; /* * RmW on rseq_event_mask must be performed atomically * with respect to preemption. */ unsigned long rseq_event_mask; #endif struct tlbflush_unmap_batch tlb_ubc; union { refcount_t rcu_users; struct rcu_head rcu; }; /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; struct page_frag task_frag; #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info *delays; #endif #ifdef CONFIG_FAULT_INJECTION int make_it_fail; unsigned int fail_nth; #endif /* * When (nr_dirtied >= nr_dirtied_pause), it's time to call * balance_dirty_pages() for a dirty throttling pause: */ int nr_dirtied; int nr_dirtied_pause; /* Start of a write-and-pause period: */ unsigned long dirty_paused_when; #ifdef CONFIG_LATENCYTOP int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; #endif /* * Time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. */ u64 timer_slack_ns; u64 default_timer_slack_ns; #ifdef CONFIG_KASAN unsigned int kasan_depth; #endif #ifdef CONFIG_KCSAN struct kcsan_ctx kcsan_ctx; #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events kcsan_save_irqtrace; #endif #endif #if IS_ENABLED(CONFIG_KUNIT) struct kunit *kunit_test; #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack: */ int curr_ret_stack; int curr_ret_depth; /* Stack of return addresses for return function tracing: */ struct ftrace_ret_stack *ret_stack; /* Timestamp for last schedule: */ unsigned long long ftrace_timestamp; /* * Number of functions that haven't been traced * because of depth overrun: */ atomic_t trace_overrun; /* Pause tracing: */ atomic_t tracing_graph_pause; #endif #ifdef CONFIG_TRACING /* State flags for use by tracers: */ unsigned long trace; /* Bitmask and counter of trace recursion: */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_KCOV /* See kernel/kcov.c for more details. */ /* Coverage collection mode enabled for this task (0 if disabled): */ unsigned int kcov_mode; /* Size of the kcov_area: */ unsigned int kcov_size; /* Buffer for coverage collection: */ void *kcov_area; /* KCOV descriptor wired with this task or NULL: */ struct kcov *kcov; /* KCOV common handle for remote coverage collection: */ u64 kcov_handle; /* KCOV sequence number: */ int kcov_sequence; /* Collect coverage from softirq context: */ unsigned int kcov_softirq; #endif #ifdef CONFIG_MEMCG struct mem_cgroup *memcg_in_oom; gfp_t memcg_oom_gfp_mask; int memcg_oom_order; /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; #endif #ifdef CONFIG_BLK_CGROUP struct request_queue *throttle_queue; #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif #if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) unsigned int sequential_io; unsigned int sequential_io_avg; #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; #endif int pagefault_disabled; #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; struct timer_list oom_reaper_timer; #endif #ifdef CONFIG_VMAP_STACK struct vm_struct *stack_vm_area; #endif #ifdef CONFIG_THREAD_INFO_IN_TASK /* A live task holds one reference: */ refcount_t stack_refcount; #endif #ifdef CONFIG_LIVEPATCH int patch_state; #endif #ifdef CONFIG_SECURITY /* Used by LSM modules for access restriction: */ void *security; #endif #ifdef CONFIG_GCC_PLUGIN_STACKLEAK unsigned long lowest_stack; unsigned long prev_lowest_stack; #endif #ifdef CONFIG_X86_MCE void __user *mce_vaddr; __u64 mce_kflags; u64 mce_addr; __u64 mce_ripv : 1, mce_whole_page : 1, __mce_reserved : 62; struct callback_head mce_kill_me; int mce_count; #endif /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. */ randomized_struct_fields_end /* CPU-specific state of this task: */ struct thread_struct thread; /* * WARNING: on x86, 'thread_struct' contains a variable-sized * structure. It *MUST* be at the end of 'task_struct'. * * Do not put anything below here! */ }; static inline struct pid *task_pid(struct task_struct *task) { return task->thread_pid; } /* * the helpers to get the task's different pids as they are seen * from various namespaces * * task_xid_nr() : global id, i.e. the id seen from the init namespace; * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of * current. * task_xid_nr_ns() : id seen from the ns specified; * * see also pid_nr() etc in include/linux/pid.h */ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns); static inline pid_t task_pid_nr(struct task_struct *tsk) { return tsk->pid; } static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); } static inline pid_t task_pid_vnr(struct task_struct *tsk) { return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); } static inline pid_t task_tgid_nr(struct task_struct *tsk) { return tsk->tgid; } /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. * * Test if a process is not yet dead (at most zombie state) * If pid_alive fails, then pointers within the task structure * can be stale and must not be dereferenced. * * Return: 1 if the process is alive. 0 otherwise. */ static inline int pid_alive(const struct task_struct *p) { return p->thread_pid != NULL; } static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); } static inline pid_t task_pgrp_vnr(struct task_struct *tsk) { return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); } static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); } static inline pid_t task_session_vnr(struct task_struct *tsk) { return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns); } static inline pid_t task_tgid_vnr(struct task_struct *tsk) { return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL); } static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) { pid_t pid = 0; rcu_read_lock(); if (pid_alive(tsk)) pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); rcu_read_unlock(); return pid; } static inline pid_t task_ppid_nr(const struct task_struct *tsk) { return task_ppid_nr_ns(tsk, &init_pid_ns); } /* Obsolete, do not use: */ static inline pid_t task_pgrp_nr(struct task_struct *tsk) { return task_pgrp_nr_ns(tsk, &init_pid_ns); } #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) static inline unsigned int task_state_index(struct task_struct *tsk) { unsigned int tsk_state = READ_ONCE(tsk->state); unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); if (tsk_state == TASK_IDLE) state = TASK_REPORT_IDLE; return fls(state); } static inline char task_index_to_char(unsigned int state) { static const char state_char[] = "RSDTtXZPI"; BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1); return state_char[state]; } static inline char task_state_to_char(struct task_struct *tsk) { return task_index_to_char(task_state_index(tsk)); } /** * is_global_init - check if a task structure is init. Since init * is free to have sub-threads we need to check tgid. * @tsk: Task structure to be checked. * * Check if a task structure is the first user space task the kernel created. * * Return: 1 if the task structure is init. 0 otherwise. */ static inline int is_global_init(struct task_struct *tsk) { return task_tgid_nr(tsk) == 1; } extern struct pid *cad_pid; /* * Per process flags */ #define PF_VCPU 0x00000001 /* I'm a virtual CPU */ #define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* Getting shut down */ #define PF_IO_WORKER 0x00000010 /* Task is an IO worker */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* Dumped core */ #define PF_SIGNALED 0x00000400 /* Killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_FROZEN 0x00010000 /* Frozen for system suspend */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ #define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to, * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ /* * Only the _current_ task can read/write to tsk->flags, but other * tasks can access tsk->flags in readonly mode for example * with tsk_used_math (like during threaded core dumping). * There is however an exception to this rule during ptrace * or during fork: the ptracer task is allowed to write to the * child->flags of its traced child (same goes for fork, the parent * can write to the child->flags), because we're guaranteed the * child is not running and in turn not changing child->flags * at the same time the parent does it. */ #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) #define clear_used_math() clear_stopped_child_used_math(current) #define set_used_math() set_stopped_child_used_math(current) #define conditional_stopped_child_used_math(condition, child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) #define conditional_used_math(condition) conditional_stopped_child_used_math(condition, current) #define copy_to_stopped_child_used_math(child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) static __always_inline bool is_percpu_thread(void) { #ifdef CONFIG_SMP return (current->flags & PF_NO_SETAFFINITY) && (current->nr_cpus_allowed == 1); #else return true; #endif } /* Per-process atomic flags. */ #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ #define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */ #define PFA_SPEC_SSB_DISABLE 3 /* Speculative Store Bypass disabled */ #define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/ #define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */ #define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */ #define PFA_SPEC_SSB_NOEXEC 7 /* Speculative Store Bypass clear on execve() */ #define TASK_PFA_TEST(name, func) \ static inline bool task_##func(struct task_struct *p) \ { return test_bit(PFA_##name, &p->atomic_flags); } #define TASK_PFA_SET(name, func) \ static inline void task_set_##func(struct task_struct *p) \ { set_bit(PFA_##name, &p->atomic_flags); } #define TASK_PFA_CLEAR(name, func) \ static inline void task_clear_##func(struct task_struct *p) \ { clear_bit(PFA_##name, &p->atomic_flags); } TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs) TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs) TASK_PFA_TEST(SPREAD_PAGE, spread_page) TASK_PFA_SET(SPREAD_PAGE, spread_page) TASK_PFA_CLEAR(SPREAD_PAGE, spread_page) TASK_PFA_TEST(SPREAD_SLAB, spread_slab) TASK_PFA_SET(SPREAD_SLAB, spread_slab) TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab) TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) static inline void current_restore_flags(unsigned long orig_flags, unsigned long flags) { current->flags &= ~flags; current->flags |= orig_flags & flags; } extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); extern int task_can_attach(struct task_struct *p); extern int dl_bw_alloc(int cpu, u64 dl_bw); extern void dl_bw_free(int cpu, u64 dl_bw); #ifdef CONFIG_SMP extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); #else static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { } static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { if (!cpumask_test_cpu(0, new_mask)) return -EINVAL; return 0; } #endif extern int yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); extern int task_prio(const struct task_struct *p); /** * task_nice - return the nice value of a given task. * @p: the task in question. * * Return: The nice value [ -20 ... 0 ... 19 ]. */ static inline int task_nice(const struct task_struct *p) { return PRIO_TO_NICE((p)->static_prio); } extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); extern int available_idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern void sched_set_fifo(struct task_struct *p); extern void sched_set_fifo_low(struct task_struct *p); extern void sched_set_normal(struct task_struct *p, int nice); extern int sched_setattr(struct task_struct *, const struct sched_attr *); extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); extern struct task_struct *idle_task(int cpu); /** * is_idle_task - is the specified task an idle task? * @p: the task in question. * * Return: 1 if @p is an idle task. 0 otherwise. */ static __always_inline bool is_idle_task(const struct task_struct *p) { return !!(p->flags & PF_IDLE); } extern struct task_struct *curr_task(int cpu); extern void ia64_set_curr_task(int cpu, struct task_struct *p); void yield(void); union thread_union { #ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK struct task_struct task; #endif #ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; #endif unsigned long stack[THREAD_SIZE/sizeof(long)]; }; #ifndef CONFIG_THREAD_INFO_IN_TASK extern struct thread_info init_thread_info; #endif extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)]; #ifdef CONFIG_THREAD_INFO_IN_TASK static inline struct thread_info *task_thread_info(struct task_struct *task) { return &task->thread_info; } #elif !defined(__HAVE_THREAD_FUNCTIONS) # define task_thread_info(task) ((struct thread_info *)(task)->stack) #endif /* * find a task by one of its numerical ids * * find_task_by_pid_ns(): * finds a task by its pid in the specified namespace * find_task_by_vpid(): * finds a task by its virtual pid * * see also find_vpid() etc in include/linux/pid.h */ extern struct task_struct *find_task_by_vpid(pid_t nr); extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns); /* * find a task by its virtual pid and get the task struct */ extern struct task_struct *find_get_task_by_vpid(pid_t nr); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); extern void wake_up_new_task(struct task_struct *tsk); #ifdef CONFIG_SMP extern void kick_process(struct task_struct *tsk); #else static inline void kick_process(struct task_struct *tsk) { } #endif extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); static inline void set_task_comm(struct task_struct *tsk, const char *from) { __set_task_comm(tsk, from, false); } extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); #define get_task_comm(buf, tsk) ({ \ BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN); \ __get_task_comm(buf, sizeof(buf), tsk); \ }) #ifdef CONFIG_SMP static __always_inline void scheduler_ipi(void) { /* * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting * TIF_NEED_RESCHED remotely (for the first time) will also send * this IPI. */ preempt_fold_need_resched(); } extern unsigned long wait_task_inactive(struct task_struct *, long match_state); #else static inline void scheduler_ipi(void) { } static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state) { return 1; } #endif /* * Set thread flags in other task's structures. * See asm/thread_info.h for TIF_xxxx flags available: */ static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) { set_ti_thread_flag(task_thread_info(tsk), flag); } static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) { clear_ti_thread_flag(task_thread_info(tsk), flag); } static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag, bool value) { update_ti_thread_flag(task_thread_info(tsk), flag, value); } static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); } static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); } static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_ti_thread_flag(task_thread_info(tsk), flag); } static inline void set_tsk_need_resched(struct task_struct *tsk) { set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); } static inline void clear_tsk_need_resched(struct task_struct *tsk) { clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); } static inline int test_tsk_need_resched(struct task_struct *tsk) { return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return * value indicates whether a reschedule was done in fact. * cond_resched_lock() will drop the spinlock before scheduling, */ #ifndef CONFIG_PREEMPTION extern int _cond_resched(void); #else static inline int _cond_resched(void) { return 0; } #endif #define cond_resched() ({ \ ___might_sleep(__FILE__, __LINE__, 0); \ _cond_resched(); \ }) extern int __cond_resched_lock(spinlock_t *lock); #define cond_resched_lock(lock) ({ \ ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\ __cond_resched_lock(lock); \ }) static inline void cond_resched_rcu(void) { #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) rcu_read_unlock(); cond_resched(); rcu_read_lock(); #endif } /* * Does a critical section need to be broken due to another * task waiting?: (technically does not depend on CONFIG_PREEMPTION, * but a general need for low latency) */ static inline int spin_needbreak(spinlock_t *lock) { #ifdef CONFIG_PREEMPTION return spin_is_contended(lock); #else return 0; #endif } static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); } /* * Wrappers for p->thread_info->cpu access. No-op on UP. */ #ifdef CONFIG_SMP static inline unsigned int task_cpu(const struct task_struct *p) { #ifdef CONFIG_THREAD_INFO_IN_TASK return READ_ONCE(p->cpu); #else return READ_ONCE(task_thread_info(p)->cpu); #endif } extern void set_task_cpu(struct task_struct *p, unsigned int cpu); #else static inline unsigned int task_cpu(const struct task_struct *p) { return 0; } static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) { } #endif /* CONFIG_SMP */ /* * In order to reduce various lock holder preemption latencies provide an * interface to see if a vCPU is currently running or not. * * This allows us to terminate optimistic spin loops and block, analogous to * the native optimistic spin heuristic of testing if the lock owner task is * running or not. */ #ifndef vcpu_is_preempted static inline bool vcpu_is_preempted(int cpu) { return false; } #endif extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); extern long sched_getaffinity(pid_t pid, struct cpumask *mask); #ifndef TASK_SIZE_OF #define TASK_SIZE_OF(tsk) TASK_SIZE #endif #ifdef CONFIG_RSEQ /* * Map the event mask on the user-space ABI enum rseq_cs_flags * for direct mask checks. */ enum rseq_event_mask_bits { RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, }; enum rseq_event_mask { RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), }; static inline void rseq_set_notify_resume(struct task_struct *t) { if (t->rseq) set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); } void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { if (current->rseq) __rseq_handle_notify_resume(ksig, regs); } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { preempt_disable(); __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask); preempt_enable(); rseq_handle_notify_resume(ksig, regs); } /* rseq_preempt() requires preemption to be disabled. */ static inline void rseq_preempt(struct task_struct *t) { __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); rseq_set_notify_resume(t); } /* rseq_migrate() requires preemption to be disabled. */ static inline void rseq_migrate(struct task_struct *t) { __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); rseq_set_notify_resume(t); } /* * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. */ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { if (clone_flags & CLONE_VM) { t->rseq = NULL; t->rseq_sig = 0; t->rseq_event_mask = 0; } else { t->rseq = current->rseq; t->rseq_sig = current->rseq_sig; t->rseq_event_mask = current->rseq_event_mask; } } static inline void rseq_execve(struct task_struct *t) { t->rseq = NULL; t->rseq_sig = 0; t->rseq_event_mask = 0; } #else static inline void rseq_set_notify_resume(struct task_struct *t) { } static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_preempt(struct task_struct *t) { } static inline void rseq_migrate(struct task_struct *t) { } static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { } static inline void rseq_execve(struct task_struct *t) { } #endif #ifdef CONFIG_DEBUG_RSEQ void rseq_syscall(struct pt_regs *regs); #else static inline void rseq_syscall(struct pt_regs *regs) { } #endif const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq); char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len); int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq); const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq); const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq); const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq); int sched_trace_rq_cpu(struct rq *rq); int sched_trace_rq_cpu_capacity(struct rq *rq); int sched_trace_rq_nr_running(struct rq *rq); const struct cpumask *sched_trace_rd_span(struct root_domain *rd); #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 /* SPDX-License-Identifier: GPL-2.0 */ /* * descriptor table internals; you almost certainly want file.h instead. */ #ifndef __LINUX_FDTABLE_H #define __LINUX_FDTABLE_H #include <linux/posix_types.h> #include <linux/compiler.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/nospec.h> #include <linux/types.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/atomic.h> /* * The default fd array needs to be at least BITS_PER_LONG, * as this is the granularity returned by copy_fdset(). */ #define NR_OPEN_DEFAULT BITS_PER_LONG #define NR_OPEN_MAX ~0U struct fdtable { unsigned int max_fds; struct file __rcu **fd; /* current fd array */ unsigned long *close_on_exec; unsigned long *open_fds; unsigned long *full_fds_bits; struct rcu_head rcu; }; static inline bool close_on_exec(unsigned int fd, const struct fdtable *fdt) { return test_bit(fd, fdt->close_on_exec); } static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) { return test_bit(fd, fdt->open_fds); } /* * Open file table structure */ struct files_struct { /* * read mostly part */ atomic_t count; bool resize_in_progress; wait_queue_head_t resize_wait; struct fdtable __rcu *fdt; struct fdtable fdtab; /* * written part on a separate cache line in SMP */ spinlock_t file_lock ____cacheline_aligned_in_smp; unsigned int next_fd; unsigned long close_on_exec_init[1]; unsigned long open_fds_init[1]; unsigned long full_fds_bits_init[1]; struct file __rcu * fd_array[NR_OPEN_DEFAULT]; }; struct file_operations; struct vfsmount; struct dentry; #define rcu_dereference_check_fdtable(files, fdtfd) \ rcu_dereference_check((fdtfd), lockdep_is_held(&(files)->file_lock)) #define files_fdtable(files) \ rcu_dereference_check_fdtable((files), (files)->fdt) /* * The caller must ensure that fd table isn't shared or hold rcu or file lock */ static inline struct file *__fcheck_files(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = rcu_dereference_raw(files->fdt); if (fd < fdt->max_fds) { fd = array_index_nospec(fd, fdt->max_fds); return rcu_dereference_raw(fdt->fd[fd]); } return NULL; } static inline struct file *fcheck_files(struct files_struct *files, unsigned int fd) { RCU_LOCKDEP_WARN(!rcu_read_lock_held() && !lockdep_is_held(&files->file_lock), "suspicious rcu_dereference_check() usage"); return __fcheck_files(files, fd); } /* * Check whether the specified fd has an open file. */ #define fcheck(fd) fcheck_files(current->files, fd) struct task_struct; struct files_struct *get_files_struct(struct task_struct *); void put_files_struct(struct files_struct *fs); void reset_files_struct(struct files_struct *); int unshare_files(struct files_struct **); struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy; void do_close_on_exec(struct files_struct *); int iterate_fd(struct files_struct *, unsigned, int (*)(const void *, struct file *, unsigned), const void *); extern int __alloc_fd(struct files_struct *files, unsigned start, unsigned end, unsigned flags); extern void __fd_install(struct files_struct *files, unsigned int fd, struct file *file); extern int __close_fd(struct files_struct *files, unsigned int fd); extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); extern int close_fd_get_file(unsigned int fd, struct file **res); extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, struct files_struct **new_fdp); extern struct kmem_cache *files_cachep; #endif /* __LINUX_FDTABLE_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 /* SPDX-License-Identifier: GPL-2.0 */ /* * workqueue.h --- work queue handling for Linux. */ #ifndef _LINUX_WORKQUEUE_H #define _LINUX_WORKQUEUE_H #include <linux/timer.h> #include <linux/linkage.h> #include <linux/bitops.h> #include <linux/lockdep.h> #include <linux/threads.h> #include <linux/atomic.h> #include <linux/cpumask.h> #include <linux/rcupdate.h> struct workqueue_struct; struct work_struct; typedef void (*work_func_t)(struct work_struct *work); void delayed_work_timer_fn(struct timer_list *t); /* * The first word is the work queue pointer and the flags rolled into * one */ #define work_data_bits(work) ((unsigned long *)(&(work)->data)) enum { WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */ WORK_STRUCT_INACTIVE_BIT= 1, /* work item is inactive */ WORK_STRUCT_PWQ_BIT = 2, /* data points to pwq */ WORK_STRUCT_LINKED_BIT = 3, /* next work is linked to this one */ #ifdef CONFIG_DEBUG_OBJECTS_WORK WORK_STRUCT_STATIC_BIT = 4, /* static initializer (debugobjects) */ WORK_STRUCT_COLOR_SHIFT = 5, /* color for workqueue flushing */ #else WORK_STRUCT_COLOR_SHIFT = 4, /* color for workqueue flushing */ #endif WORK_STRUCT_COLOR_BITS = 4, WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, WORK_STRUCT_INACTIVE = 1 << WORK_STRUCT_INACTIVE_BIT, WORK_STRUCT_PWQ = 1 << WORK_STRUCT_PWQ_BIT, WORK_STRUCT_LINKED = 1 << WORK_STRUCT_LINKED_BIT, #ifdef CONFIG_DEBUG_OBJECTS_WORK WORK_STRUCT_STATIC = 1 << WORK_STRUCT_STATIC_BIT, #else WORK_STRUCT_STATIC = 0, #endif /* * The last color is no color used for works which don't * participate in workqueue flushing. */ WORK_NR_COLORS = (1 << WORK_STRUCT_COLOR_BITS) - 1, WORK_NO_COLOR = WORK_NR_COLORS, /* not bound to any CPU, prefer the local CPU */ WORK_CPU_UNBOUND = NR_CPUS, /* * Reserve 8 bits off of pwq pointer w/ debugobjects turned off. * This makes pwqs aligned to 256 bytes and allows 15 workqueue * flush colors. */ WORK_STRUCT_FLAG_BITS = WORK_STRUCT_COLOR_SHIFT + WORK_STRUCT_COLOR_BITS, /* data contains off-queue information when !WORK_STRUCT_PWQ */ WORK_OFFQ_FLAG_BASE = WORK_STRUCT_COLOR_SHIFT, __WORK_OFFQ_CANCELING = WORK_OFFQ_FLAG_BASE, /* * When a work item is off queue, its high bits point to the last * pool it was on. Cap at 31 bits and use the highest number to * indicate that no pool is associated. */ WORK_OFFQ_FLAG_BITS = 1, WORK_OFFQ_POOL_SHIFT = WORK_OFFQ_FLAG_BASE + WORK_OFFQ_FLAG_BITS, WORK_OFFQ_LEFT = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT, WORK_OFFQ_POOL_BITS = WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31, /* bit mask for work_busy() return values */ WORK_BUSY_PENDING = 1 << 0, WORK_BUSY_RUNNING = 1 << 1, /* maximum string length for set_worker_desc() */ WORKER_DESC_LEN = 24, }; /* Convenience constants - of type 'unsigned long', not 'enum'! */ #define WORK_OFFQ_CANCELING (1ul << __WORK_OFFQ_CANCELING) #define WORK_OFFQ_POOL_NONE ((1ul << WORK_OFFQ_POOL_BITS) - 1) #define WORK_STRUCT_NO_POOL (WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT) #define WORK_STRUCT_FLAG_MASK ((1ul << WORK_STRUCT_FLAG_BITS) - 1) #define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK) struct work_struct { atomic_long_t data; struct list_head entry; work_func_t func; #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif }; #define WORK_DATA_INIT() ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL) #define WORK_DATA_STATIC_INIT() \ ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC)) struct delayed_work { struct work_struct work; struct timer_list timer; /* target workqueue and CPU ->timer uses to queue ->work */ struct workqueue_struct *wq; int cpu; }; struct rcu_work { struct work_struct work; struct rcu_head rcu; /* target workqueue ->rcu uses to queue ->work */ struct workqueue_struct *wq; }; /** * struct workqueue_attrs - A struct for workqueue attributes. * * This can be used to change attributes of an unbound workqueue. */ struct workqueue_attrs { /** * @nice: nice level */ int nice; /** * @cpumask: allowed CPUs */ cpumask_var_t cpumask; /** * @no_numa: disable NUMA affinity * * Unlike other fields, ``no_numa`` isn't a property of a worker_pool. It * only modifies how :c:func:`apply_workqueue_attrs` select pools and thus * doesn't participate in pool hash calculations or equality comparisons. */ bool no_numa; }; static inline struct delayed_work *to_delayed_work(struct work_struct *work) { return container_of(work, struct delayed_work, work); } static inline struct rcu_work *to_rcu_work(struct work_struct *work) { return container_of(work, struct rcu_work, work); } struct execute_work { struct work_struct work; }; #ifdef CONFIG_LOCKDEP /* * NB: because we have to copy the lockdep_map, setting _key * here is required, otherwise it could get initialised to the * copy of the lockdep_map! */ #define __WORK_INIT_LOCKDEP_MAP(n, k) \ .lockdep_map = STATIC_LOCKDEP_MAP_INIT(n, k), #else #define __WORK_INIT_LOCKDEP_MAP(n, k) #endif #define __WORK_INITIALIZER(n, f) { \ .data = WORK_DATA_STATIC_INIT(), \ .entry = { &(n).entry, &(n).entry }, \ .func = (f), \ __WORK_INIT_LOCKDEP_MAP(#n, &(n)) \ } #define __DELAYED_WORK_INITIALIZER(n, f, tflags) { \ .work = __WORK_INITIALIZER((n).work, (f)), \ .timer = __TIMER_INITIALIZER(delayed_work_timer_fn,\ (tflags) | TIMER_IRQSAFE), \ } #define DECLARE_WORK(n, f) \ struct work_struct n = __WORK_INITIALIZER(n, f) #define DECLARE_DELAYED_WORK(n, f) \ struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, 0) #define DECLARE_DEFERRABLE_WORK(n, f) \ struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, TIMER_DEFERRABLE) #ifdef CONFIG_DEBUG_OBJECTS_WORK extern void __init_work(struct work_struct *work, int onstack); extern void destroy_work_on_stack(struct work_struct *work); extern void destroy_delayed_work_on_stack(struct delayed_work *work); static inline unsigned int work_static(struct work_struct *work) { return *work_data_bits(work) & WORK_STRUCT_STATIC; } #else static inline void __init_work(struct work_struct *work, int onstack) { } static inline void destroy_work_on_stack(struct work_struct *work) { } static inline void destroy_delayed_work_on_stack(struct delayed_work *work) { } static inline unsigned int work_static(struct work_struct *work) { return 0; } #endif /* * initialize all of a work item in one go * * NOTE! No point in using "atomic_long_set()": using a direct * assignment of the work data initializer allows the compiler * to generate better code. */ #ifdef CONFIG_LOCKDEP #define __INIT_WORK(_work, _func, _onstack) \ do { \ static struct lock_class_key __key; \ \ __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, &__key, 0); \ INIT_LIST_HEAD(&(_work)->entry); \ (_work)->func = (_func); \ } while (0) #else #define __INIT_WORK(_work, _func, _onstack) \ do { \ __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ INIT_LIST_HEAD(&(_work)->entry); \ (_work)->func = (_func); \ } while (0) #endif #define INIT_WORK(_work, _func) \ __INIT_WORK((_work), (_func), 0) #define INIT_WORK_ONSTACK(_work, _func) \ __INIT_WORK((_work), (_func), 1) #define __INIT_DELAYED_WORK(_work, _func, _tflags) \ do { \ INIT_WORK(&(_work)->work, (_func)); \ __init_timer(&(_work)->timer, \ delayed_work_timer_fn, \ (_tflags) | TIMER_IRQSAFE); \ } while (0) #define __INIT_DELAYED_WORK_ONSTACK(_work, _func, _tflags) \ do { \ INIT_WORK_ONSTACK(&(_work)->work, (_func)); \ __init_timer_on_stack(&(_work)->timer, \ delayed_work_timer_fn, \ (_tflags) | TIMER_IRQSAFE); \ } while (0) #define INIT_DELAYED_WORK(_work, _func) \ __INIT_DELAYED_WORK(_work, _func, 0) #define INIT_DELAYED_WORK_ONSTACK(_work, _func) \ __INIT_DELAYED_WORK_ONSTACK(_work, _func, 0) #define INIT_DEFERRABLE_WORK(_work, _func) \ __INIT_DELAYED_WORK(_work, _func, TIMER_DEFERRABLE) #define INIT_DEFERRABLE_WORK_ONSTACK(_work, _func) \ __INIT_DELAYED_WORK_ONSTACK(_work, _func, TIMER_DEFERRABLE) #define INIT_RCU_WORK(_work, _func) \ INIT_WORK(&(_work)->work, (_func)) #define INIT_RCU_WORK_ONSTACK(_work, _func) \ INIT_WORK_ONSTACK(&(_work)->work, (_func)) /** * work_pending - Find out whether a work item is currently pending * @work: The work item in question */ #define work_pending(work) \ test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) /** * delayed_work_pending - Find out whether a delayable work item is currently * pending * @w: The work item in question */ #define delayed_work_pending(w) \ work_pending(&(w)->work) /* * Workqueue flags and constants. For details, please refer to * Documentation/core-api/workqueue.rst. */ enum { WQ_UNBOUND = 1 << 1, /* not bound to any cpu */ WQ_FREEZABLE = 1 << 2, /* freeze during suspend */ WQ_MEM_RECLAIM = 1 << 3, /* may be used for memory reclaim */ WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu intensive workqueue */ WQ_SYSFS = 1 << 6, /* visible in sysfs, see wq_sysfs_register() */ /* * Per-cpu workqueues are generally preferred because they tend to * show better performance thanks to cache locality. Per-cpu * workqueues exclude the scheduler from choosing the CPU to * execute the worker threads, which has an unfortunate side effect * of increasing power consumption. * * The scheduler considers a CPU idle if it doesn't have any task * to execute and tries to keep idle cores idle to conserve power; * however, for example, a per-cpu work item scheduled from an * interrupt handler on an idle CPU will force the scheduler to * excute the work item on that CPU breaking the idleness, which in * turn may lead to more scheduling choices which are sub-optimal * in terms of power consumption. * * Workqueues marked with WQ_POWER_EFFICIENT are per-cpu by default * but become unbound if workqueue.power_efficient kernel param is * specified. Per-cpu workqueues which are identified to * contribute significantly to power-consumption are identified and * marked with this flag and enabling the power_efficient mode * leads to noticeable power saving at the cost of small * performance disadvantage. * * http://thread.gmane.org/gmane.linux.kernel/1480396 */ WQ_POWER_EFFICIENT = 1 << 7, __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ __WQ_ORDERED_EXPLICIT = 1 << 19, /* internal: alloc_ordered_workqueue() */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, }; /* unbound wq's aren't per-cpu, scale max_active according to #cpus */ #define WQ_UNBOUND_MAX_ACTIVE \ max_t(int, WQ_MAX_ACTIVE, num_possible_cpus() * WQ_MAX_UNBOUND_PER_CPU) /* * System-wide workqueues which are always present. * * system_wq is the one used by schedule[_delayed]_work[_on](). * Multi-CPU multi-threaded. There are users which expect relatively * short queue flush time. Don't queue works which can run for too * long. * * system_highpri_wq is similar to system_wq but for work items which * require WQ_HIGHPRI. * * system_long_wq is similar to system_wq but may host long running * works. Queue flushing might take relatively long. * * system_unbound_wq is unbound workqueue. Workers are not bound to * any specific CPU, not concurrency managed, and all queued works are * executed immediately as long as max_active limit is not reached and * resources are available. * * system_freezable_wq is equivalent to system_wq except that it's * freezable. * * *_power_efficient_wq are inclined towards saving power and converted * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise, * they are same as their non-power-efficient counterparts - e.g. * system_power_efficient_wq is identical to system_wq if * 'wq_power_efficient' is disabled. See WQ_POWER_EFFICIENT for more info. */ extern struct workqueue_struct *system_wq; extern struct workqueue_struct *system_highpri_wq; extern struct workqueue_struct *system_long_wq; extern struct workqueue_struct *system_unbound_wq; extern struct workqueue_struct *system_freezable_wq; extern struct workqueue_struct *system_power_efficient_wq; extern struct workqueue_struct *system_freezable_power_efficient_wq; /** * alloc_workqueue - allocate a workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags * @max_active: max in-flight work items, 0 for default * remaining args: args for @fmt * * Allocate a workqueue with the specified parameters. For detailed * information on WQ_* flags, please refer to * Documentation/core-api/workqueue.rst. * * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ struct workqueue_struct *alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...); /** * alloc_ordered_workqueue - allocate an ordered workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful) * @args...: args for @fmt * * Allocate an ordered workqueue. An ordered workqueue executes at * most one work item at any given time in the queued order. They are * implemented as unbound workqueues with @max_active of one. * * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ #define alloc_ordered_workqueue(fmt, flags, args...) \ alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | \ __WQ_ORDERED_EXPLICIT | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name)) #define create_freezable_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND | \ WQ_MEM_RECLAIM, 1, (name)) #define create_singlethread_workqueue(name) \ alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name) extern void destroy_workqueue(struct workqueue_struct *wq); struct workqueue_attrs *alloc_workqueue_attrs(void); void free_workqueue_attrs(struct workqueue_attrs *attrs); int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs); int workqueue_set_unbound_cpumask(cpumask_var_t cpumask); extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); extern bool queue_work_node(int node, struct workqueue_struct *wq, struct work_struct *work); extern bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay); extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay); extern bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork); extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); extern bool flush_work(struct work_struct *work); extern bool cancel_work(struct work_struct *work); extern bool cancel_work_sync(struct work_struct *work); extern bool flush_delayed_work(struct delayed_work *dwork); extern bool cancel_delayed_work(struct delayed_work *dwork); extern bool cancel_delayed_work_sync(struct delayed_work *dwork); extern bool flush_rcu_work(struct rcu_work *rwork); extern void workqueue_set_max_active(struct workqueue_struct *wq, int max_active); extern struct work_struct *current_work(void); extern bool current_is_workqueue_rescuer(void); extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); extern __printf(1, 2) void set_worker_desc(const char *fmt, ...); extern void print_worker_info(const char *log_lvl, struct task_struct *task); extern void show_workqueue_state(void); extern void wq_worker_comm(char *buf, size_t size, struct task_struct *task); /** * queue_work - queue work on a workqueue * @wq: workqueue to use * @work: work to queue * * Returns %false if @work was already on a queue, %true otherwise. * * We queue the work to the CPU on which it was submitted, but if the CPU dies * it can be processed by another CPU. * * Memory-ordering properties: If it returns %true, guarantees that all stores * preceding the call to queue_work() in the program order will be visible from * the CPU which will execute @work by the time such work executes, e.g., * * { x is initially 0 } * * CPU0 CPU1 * * WRITE_ONCE(x, 1); [ @work is being executed ] * r0 = queue_work(wq, work); r1 = READ_ONCE(x); * * Forbids: r0 == true && r1 == 0 */ static inline bool queue_work(struct workqueue_struct *wq, struct work_struct *work) { return queue_work_on(WORK_CPU_UNBOUND, wq, work); } /** * queue_delayed_work - queue work on a workqueue after delay * @wq: workqueue to use * @dwork: delayable work to queue * @delay: number of jiffies to wait before queueing * * Equivalent to queue_delayed_work_on() but tries to use the local CPU. */ static inline bool queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); } /** * mod_delayed_work - modify delay of or queue a delayed work * @wq: workqueue to use * @dwork: work to queue * @delay: number of jiffies to wait before queueing * * mod_delayed_work_on() on local CPU. */ static inline bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); } /** * schedule_work_on - put work task on a specific cpu * @cpu: cpu to put the work task on * @work: job to be done * * This puts a job on a specific cpu */ static inline bool schedule_work_on(int cpu, struct work_struct *work) { return queue_work_on(cpu, system_wq, work); } /** * schedule_work - put work task in global workqueue * @work: job to be done * * Returns %false if @work was already on the kernel-global workqueue and * %true otherwise. * * This puts a job in the kernel-global workqueue if it was not already * queued and leaves it in the same position on the kernel-global * workqueue otherwise. * * Shares the same memory-ordering properties of queue_work(), cf. the * DocBook header of queue_work(). */ static inline bool schedule_work(struct work_struct *work) { return queue_work(system_wq, work); } /** * flush_scheduled_work - ensure that any scheduled work has run to completion. * * Forces execution of the kernel-global workqueue and blocks until its * completion. * * Think twice before calling this function! It's very easy to get into * trouble if you don't take great care. Either of the following situations * will lead to deadlock: * * One of the work items currently on the workqueue needs to acquire * a lock held by your code or its caller. * * Your code is running in the context of a work routine. * * They will be detected by lockdep when they occur, but the first might not * occur very often. It depends on what work items are on the workqueue and * what locks they need, which you have no control over. * * In most situations flushing the entire workqueue is overkill; you merely * need to know that a particular work item isn't queued and isn't running. * In such cases you should use cancel_delayed_work_sync() or * cancel_work_sync() instead. */ static inline void flush_scheduled_work(void) { flush_workqueue(system_wq); } /** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay * @cpu: cpu to use * @dwork: job to be done * @delay: number of jiffies to wait * * After waiting for a given time this puts a job in the kernel-global * workqueue on the specified CPU. */ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work_on(cpu, system_wq, dwork, delay); } /** * schedule_delayed_work - put work task in global workqueue after delay * @dwork: job to be done * @delay: number of jiffies to wait or 0 for immediate execution * * After waiting for a given time this puts a job in the kernel-global * workqueue. */ static inline bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work(system_wq, dwork, delay); } #ifndef CONFIG_SMP static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } #else long work_on_cpu(int cpu, long (*fn)(void *), void *arg); long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER extern void freeze_workqueues_begin(void); extern bool freeze_workqueues_busy(void); extern void thaw_workqueues(void); #endif /* CONFIG_FREEZER */ #ifdef CONFIG_SYSFS int workqueue_sysfs_register(struct workqueue_struct *wq); #else /* CONFIG_SYSFS */ static inline int workqueue_sysfs_register(struct workqueue_struct *wq) { return 0; } #endif /* CONFIG_SYSFS */ #ifdef CONFIG_WQ_WATCHDOG void wq_watchdog_touch(int cpu); #else /* CONFIG_WQ_WATCHDOG */ static inline void wq_watchdog_touch(int cpu) { } #endif /* CONFIG_WQ_WATCHDOG */ #ifdef CONFIG_SMP int workqueue_prepare_cpu(unsigned int cpu); int workqueue_online_cpu(unsigned int cpu); int workqueue_offline_cpu(unsigned int cpu); #endif void __init workqueue_init_early(void); void __init workqueue_init(void); #endif
6 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 /* SPDX-License-Identifier: GPL-2.0 */ /* * This header provides generic wrappers for memory access instrumentation that * the compiler cannot emit for: KASAN, KCSAN. */ #ifndef _LINUX_INSTRUMENTED_H #define _LINUX_INSTRUMENTED_H #include <linux/compiler.h> #include <linux/kasan-checks.h> #include <linux/kcsan-checks.h> #include <linux/types.h> /** * instrument_read - instrument regular read access * * Instrument a regular read access. The instrumentation should be inserted * before the actual read happens. * * @ptr address of access * @size size of access */ static __always_inline void instrument_read(const volatile void *v, size_t size) { kasan_check_read(v, size); kcsan_check_read(v, size); } /** * instrument_write - instrument regular write access * * Instrument a regular write access. The instrumentation should be inserted * before the actual write happens. * * @ptr address of access * @size size of access */ static __always_inline void instrument_write(const volatile void *v, size_t size) { kasan_check_write(v, size); kcsan_check_write(v, size); } /** * instrument_read_write - instrument regular read-write access * * Instrument a regular write access. The instrumentation should be inserted * before the actual write happens. * * @ptr address of access * @size size of access */ static __always_inline void instrument_read_write(const volatile void *v, size_t size) { kasan_check_write(v, size); kcsan_check_read_write(v, size); } /** * instrument_atomic_read - instrument atomic read access * * Instrument an atomic read access. The instrumentation should be inserted * before the actual read happens. * * @ptr address of access * @size size of access */ static __always_inline void instrument_atomic_read(const volatile void *v, size_t size) { kasan_check_read(v, size); kcsan_check_atomic_read(v, size); } /** * instrument_atomic_write - instrument atomic write access * * Instrument an atomic write access. The instrumentation should be inserted * before the actual write happens. * * @ptr address of access * @size size of access */ static __always_inline void instrument_atomic_write(const volatile void *v, size_t size) { kasan_check_write(v, size); kcsan_check_atomic_write(v, size); } /** * instrument_atomic_read_write - instrument atomic read-write access * * Instrument an atomic read-write access. The instrumentation should be * inserted before the actual write happens. * * @ptr address of access * @size size of access */ static __always_inline void instrument_atomic_read_write(const volatile void *v, size_t size) { kasan_check_write(v, size); kcsan_check_atomic_read_write(v, size); } /** * instrument_copy_to_user - instrument reads of copy_to_user * * Instrument reads from kernel memory, that are due to copy_to_user (and * variants). The instrumentation must be inserted before the accesses. * * @to destination address * @from source address * @n number of bytes to copy */ static __always_inline void instrument_copy_to_user(void __user *to, const void *from, unsigned long n) { kasan_check_read(from, n); kcsan_check_read(from, n); } /** * instrument_copy_from_user - instrument writes of copy_from_user * * Instrument writes to kernel memory, that are due to copy_from_user (and * variants). The instrumentation should be inserted before the accesses. * * @to destination address * @from source address * @n number of bytes to copy */ static __always_inline void instrument_copy_from_user(const void *to, const void __user *from, unsigned long n) { kasan_check_write(to, n); kcsan_check_write(to, n); } #endif /* _LINUX_INSTRUMENTED_H */
1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/file.c * * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes * * Manage the dynamic fd arrays in the process files_struct. */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/close_range.h> #include <net/sock.h> #include "internal.h" unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; /* our min() is unusable in constant expressions ;-/ */ #define __const_min(x, y) ((x) < (y) ? (x) : (y)) unsigned int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; static void __free_fdtable(struct fdtable *fdt) { kvfree(fdt->fd); kvfree(fdt->open_fds); kfree(fdt); } static void free_fdtable_rcu(struct rcu_head *rcu) { __free_fdtable(container_of(rcu, struct fdtable, rcu)); } #define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) #define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) /* * Copy 'count' fd bits from the old table to the new table and clear the extra * space if any. This does not copy the file pointers. Called with the files * spinlock held for write. */ static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, unsigned int count) { unsigned int cpy, set; cpy = count / BITS_PER_BYTE; set = (nfdt->max_fds - count) / BITS_PER_BYTE; memcpy(nfdt->open_fds, ofdt->open_fds, cpy); memset((char *)nfdt->open_fds + cpy, 0, set); memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); memset((char *)nfdt->close_on_exec + cpy, 0, set); cpy = BITBIT_SIZE(count); set = BITBIT_SIZE(nfdt->max_fds) - cpy; memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy); memset((char *)nfdt->full_fds_bits + cpy, 0, set); } /* * Copy all file descriptors from the old table to the new, expanded table and * clear the extra space. Called with the files spinlock held for write. */ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) { size_t cpy, set; BUG_ON(nfdt->max_fds < ofdt->max_fds); cpy = ofdt->max_fds * sizeof(struct file *); set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); memcpy(nfdt->fd, ofdt->fd, cpy); memset((char *)nfdt->fd + cpy, 0, set); copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds); } /* * Note how the fdtable bitmap allocations very much have to be a multiple of * BITS_PER_LONG. This is not only because we walk those things in chunks of * 'unsigned long' in some places, but simply because that is how the Linux * kernel bitmaps are defined to work: they are not "bits in an array of bytes", * they are very much "bits in an array of unsigned long". * * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied * by that "1024/sizeof(ptr)" before, we already know there are sufficient * clear low bits. Clang seems to realize that, gcc ends up being confused. * * On a 128-bit machine, the ALIGN() would actually matter. In the meantime, * let's consider it documentation (and maybe a test-case for gcc to improve * its code generation ;) */ static struct fdtable * alloc_fdtable(unsigned int nr) { struct fdtable *fdt; void *data; /* * Figure out how many fds we actually want to support in this fdtable. * Allocation steps are keyed to the size of the fdarray, since it * grows far faster than any of the other dynamic data. We try to fit * the fdarray into comfortable page-tuned chunks: starting at 1024B * and growing in powers of two from there on. */ nr /= (1024 / sizeof(struct file *)); nr = roundup_pow_of_two(nr + 1); nr *= (1024 / sizeof(struct file *)); nr = ALIGN(nr, BITS_PER_LONG); /* * Note that this can drive nr *below* what we had passed if sysctl_nr_open * had been set lower between the check in expand_files() and here. Deal * with that in caller, it's cheaper that way. * * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise * bitmaps handling below becomes unpleasant, to put it mildly... */ if (unlikely(nr > sysctl_nr_open)) nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) goto out; fdt->max_fds = nr; data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT); if (!data) goto out_fdt; fdt->fd = data; data = kvmalloc(max_t(size_t, 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES), GFP_KERNEL_ACCOUNT); if (!data) goto out_arr; fdt->open_fds = data; data += nr / BITS_PER_BYTE; fdt->close_on_exec = data; data += nr / BITS_PER_BYTE; fdt->full_fds_bits = data; return fdt; out_arr: kvfree(fdt->fd); out_fdt: kfree(fdt); out: return NULL; } /* * Expand the file descriptor table. * This function will allocate a new fdtable and both fd array and fdset, of * the given size. * Return <0 error code on error; 1 on successful completion. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_fdtable(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *new_fdt, *cur_fdt; spin_unlock(&files->file_lock); new_fdt = alloc_fdtable(nr); /* make sure all __fd_install() have seen resize_in_progress * or have finished their rcu_read_lock_sched() section. */ if (atomic_read(&files->count) > 1) synchronize_rcu(); spin_lock(&files->file_lock); if (!new_fdt) return -ENOMEM; /* * extremely unlikely race - sysctl_nr_open decreased between the check in * caller and alloc_fdtable(). Cheaper to catch it here... */ if (unlikely(new_fdt->max_fds <= nr)) { __free_fdtable(new_fdt); return -EMFILE; } cur_fdt = files_fdtable(files); BUG_ON(nr < cur_fdt->max_fds); copy_fdtable(new_fdt, cur_fdt); rcu_assign_pointer(files->fdt, new_fdt); if (cur_fdt != &files->fdtab) call_rcu(&cur_fdt->rcu, free_fdtable_rcu); /* coupled with smp_rmb() in __fd_install() */ smp_wmb(); return 1; } /* * Expand files. * This function will expand the file structures, if the requested size exceeds * the current capacity and there is room for expansion. * Return <0 error code on error; 0 when nothing done; 1 when files were * expanded and execution may have blocked. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_files(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *fdt; int expanded = 0; repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) return expanded; /* Can we expand? */ if (nr >= sysctl_nr_open) return -EMFILE; if (unlikely(files->resize_in_progress)) { spin_unlock(&files->file_lock); expanded = 1; wait_event(files->resize_wait, !files->resize_in_progress); spin_lock(&files->file_lock); goto repeat; } /* All good, so we try */ files->resize_in_progress = true; expanded = expand_fdtable(files, nr); files->resize_in_progress = false; wake_up_all(&files->resize_wait); return expanded; } static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt) { __set_bit(fd, fdt->close_on_exec); } static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt) { if (test_bit(fd, fdt->close_on_exec)) __clear_bit(fd, fdt->close_on_exec); } static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt) { __set_bit(fd, fdt->open_fds); fd /= BITS_PER_LONG; if (!~fdt->open_fds[fd]) __set_bit(fd, fdt->full_fds_bits); } static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) { __clear_bit(fd, fdt->open_fds); __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits); } static unsigned int count_open_files(struct fdtable *fdt) { unsigned int size = fdt->max_fds; unsigned int i; /* Find the last open fd */ for (i = size / BITS_PER_LONG; i > 0; ) { if (fdt->open_fds[--i]) break; } i = (i + 1) * BITS_PER_LONG; return i; } /* * Note that a sane fdtable size always has to be a multiple of * BITS_PER_LONG, since we have bitmaps that are sized by this. * * 'max_fds' will normally already be properly aligned, but it * turns out that in the close_range() -> __close_range() -> * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end * up having a 'max_fds' value that isn't already aligned. * * Rather than make close_range() have to worry about this, * just make that BITS_PER_LONG alignment be part of a sane * fdtable size. Becuase that's really what it is. */ static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds) { unsigned int count; count = count_open_files(fdt); if (max_fds < NR_OPEN_DEFAULT) max_fds = NR_OPEN_DEFAULT; return ALIGN(min(count, max_fds), BITS_PER_LONG); } /* * Allocate a new files structure and copy contents from the * passed in files structure. * errorp will be valid only when the returned files_struct is NULL. */ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp) { struct files_struct *newf; struct file **old_fds, **new_fds; unsigned int open_files, i; struct fdtable *old_fdt, *new_fdt; *errorp = -ENOMEM; newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) goto out; atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); newf->resize_in_progress = false; init_waitqueue_head(&newf->resize_wait); newf->next_fd = 0; new_fdt = &newf->fdtab; new_fdt->max_fds = NR_OPEN_DEFAULT; new_fdt->close_on_exec = newf->close_on_exec_init; new_fdt->open_fds = newf->open_fds_init; new_fdt->full_fds_bits = newf->full_fds_bits_init; new_fdt->fd = &newf->fd_array[0]; spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, max_fds); /* * Check whether we need to allocate a larger fd array and fd set. */ while (unlikely(open_files > new_fdt->max_fds)) { spin_unlock(&oldf->file_lock); if (new_fdt != &newf->fdtab) __free_fdtable(new_fdt); new_fdt = alloc_fdtable(open_files - 1); if (!new_fdt) { *errorp = -ENOMEM; goto out_release; } /* beyond sysctl_nr_open; nothing to do */ if (unlikely(new_fdt->max_fds < open_files)) { __free_fdtable(new_fdt); *errorp = -EMFILE; goto out_release; } /* * Reacquire the oldf lock and a pointer to its fd table * who knows it may have a new bigger fd table. We need * the latest pointer. */ spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, max_fds); } copy_fd_bitmaps(new_fdt, old_fdt, open_files); old_fds = old_fdt->fd; new_fds = new_fdt->fd; for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; if (f) { get_file(f); } else { /* * The fd may be claimed in the fd bitmap but not yet * instantiated in the files array if a sibling thread * is partway through open(). So make sure that this * fd is available to the new process. */ __clear_open_fd(open_files - i, new_fdt); } rcu_assign_pointer(*new_fds++, f); } spin_unlock(&oldf->file_lock); /* clear the remainder */ memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *)); rcu_assign_pointer(newf->fdt, new_fdt); return newf; out_release: kmem_cache_free(files_cachep, newf); out: return NULL; } static struct fdtable *close_files(struct files_struct * files) { /* * It is safe to dereference the fd table without RCU or * ->file_lock because this is the last reference to the * files structure. */ struct fdtable *fdt = rcu_dereference_raw(files->fdt); unsigned int i, j = 0; for (;;) { unsigned long set; i = j * BITS_PER_LONG; if (i >= fdt->max_fds) break; set = fdt->open_fds[j++]; while (set) { if (set & 1) { struct file * file = xchg(&fdt->fd[i], NULL); if (file) { filp_close(file, files); cond_resched(); } } i++; set >>= 1; } } return fdt; } struct files_struct *get_files_struct(struct task_struct *task) { struct files_struct *files; task_lock(task); files = task->files; if (files) atomic_inc(&files->count); task_unlock(task); return files; } void put_files_struct(struct files_struct *files) { if (atomic_dec_and_test(&files->count)) { struct fdtable *fdt = close_files(files); /* free the arrays if they are not embedded */ if (fdt != &files->fdtab) __free_fdtable(fdt); kmem_cache_free(files_cachep, files); } } void reset_files_struct(struct files_struct *files) { struct task_struct *tsk = current; struct files_struct *old; old = tsk->files; task_lock(tsk); tsk->files = files; task_unlock(tsk); put_files_struct(old); } void exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; if (files) { task_lock(tsk); tsk->files = NULL; task_unlock(tsk); put_files_struct(files); } } struct files_struct init_files = { .count = ATOMIC_INIT(1), .fdt = &init_files.fdtab, .fdtab = { .max_fds = NR_OPEN_DEFAULT, .fd = &init_files.fd_array[0], .close_on_exec = init_files.close_on_exec_init, .open_fds = init_files.open_fds_init, .full_fds_bits = init_files.full_fds_bits_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), }; static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) { unsigned int maxfd = fdt->max_fds; unsigned int maxbit = maxfd / BITS_PER_LONG; unsigned int bitbit = start / BITS_PER_LONG; bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; if (bitbit > maxfd) return maxfd; if (bitbit > start) start = bitbit; return find_next_zero_bit(fdt->open_fds, maxfd, start); } /* * allocate a file descriptor, mark it busy. */ int __alloc_fd(struct files_struct *files, unsigned start, unsigned end, unsigned flags) { unsigned int fd; int error; struct fdtable *fdt; spin_lock(&files->file_lock); repeat: fdt = files_fdtable(files); fd = start; if (fd < files->next_fd) fd = files->next_fd; if (fd < fdt->max_fds) fd = find_next_fd(fdt, fd); /* * N.B. For clone tasks sharing a files structure, this test * will limit the total number of files that can be opened. */ error = -EMFILE; if (fd >= end) goto out; error = expand_files(files, fd); if (error < 0) goto out; /* * If we needed to expand the fs array we * might have blocked - try again. */ if (error) goto repeat; if (start <= files->next_fd) files->next_fd = fd + 1; __set_open_fd(fd, fdt); if (flags & O_CLOEXEC) __set_close_on_exec(fd, fdt); else __clear_close_on_exec(fd, fdt); error = fd; #if 1 /* Sanity check */ if (rcu_access_pointer(fdt->fd[fd]) != NULL) { printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); rcu_assign_pointer(fdt->fd[fd], NULL); } #endif out: spin_unlock(&files->file_lock); return error; } static int alloc_fd(unsigned start, unsigned flags) { return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags); } int __get_unused_fd_flags(unsigned flags, unsigned long nofile) { return __alloc_fd(current->files, 0, nofile, flags); } int get_unused_fd_flags(unsigned flags) { return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE)); } EXPORT_SYMBOL(get_unused_fd_flags); static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); __clear_open_fd(fd, fdt); if (fd < files->next_fd) files->next_fd = fd; } void put_unused_fd(unsigned int fd) { struct files_struct *files = current->files; spin_lock(&files->file_lock); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); } EXPORT_SYMBOL(put_unused_fd); /* * Install a file pointer in the fd array. * * The VFS is full of places where we drop the files lock between * setting the open_fds bitmap and installing the file in the file * array. At any such point, we are vulnerable to a dup2() race * installing a file in the array before us. We need to detect this and * fput() the struct file we are about to overwrite in this case. * * It should never happen - if we allow dup2() do it, _really_ bad things * will follow. * * NOTE: __fd_install() variant is really, really low-level; don't * use it unless you are forced to by truly lousy API shoved down * your throat. 'files' *MUST* be either current->files or obtained * by get_files_struct(current) done by whoever had given it to you, * or really bad things will happen. Normally you want to use * fd_install() instead. */ void __fd_install(struct files_struct *files, unsigned int fd, struct file *file) { struct fdtable *fdt; rcu_read_lock_sched(); if (unlikely(files->resize_in_progress)) { rcu_read_unlock_sched(); spin_lock(&files->file_lock); fdt = files_fdtable(files); BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); spin_unlock(&files->file_lock); return; } /* coupled with smp_wmb() in expand_fdtable() */ smp_rmb(); fdt = rcu_dereference_sched(files->fdt); BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); rcu_read_unlock_sched(); } /* * This consumes the "file" refcount, so callers should treat it * as if they had called fput(file). */ void fd_install(unsigned int fd, struct file *file) { __fd_install(current->files, fd, file); } EXPORT_SYMBOL(fd_install); static struct file *pick_file(struct files_struct *files, unsigned fd) { struct file *file = NULL; struct fdtable *fdt; spin_lock(&files->file_lock); fdt = files_fdtable(files); if (fd >= fdt->max_fds) goto out_unlock; fd = array_index_nospec(fd, fdt->max_fds); file = fdt->fd[fd]; if (!file) goto out_unlock; rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); out_unlock: spin_unlock(&files->file_lock); return file; } /* * The same warnings as for __alloc_fd()/__fd_install() apply here... */ int __close_fd(struct files_struct *files, unsigned fd) { struct file *file; file = pick_file(files, fd); if (!file) return -EBADF; return filp_close(file, files); } EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ /** * __close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. */ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) { unsigned int cur_max; struct task_struct *me = current; struct files_struct *cur_fds = me->files, *fds = NULL; if (flags & ~CLOSE_RANGE_UNSHARE) return -EINVAL; if (fd > max_fd) return -EINVAL; rcu_read_lock(); cur_max = files_fdtable(cur_fds)->max_fds; rcu_read_unlock(); /* cap to last valid index into fdtable */ cur_max--; if (flags & CLOSE_RANGE_UNSHARE) { int ret; unsigned int max_unshare_fds = NR_OPEN_MAX; /* * If the requested range is greater than the current maximum, * we're closing everything so only copy all file descriptors * beneath the lowest file descriptor. */ if (max_fd >= cur_max) max_unshare_fds = fd; ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds); if (ret) return ret; /* * We used to share our file descriptor table, and have now * created a private one, make sure we're using it below. */ if (fds) swap(cur_fds, fds); } max_fd = min(max_fd, cur_max); while (fd <= max_fd) { struct file *file; file = pick_file(cur_fds, fd++); if (!file) continue; filp_close(file, cur_fds); cond_resched(); } if (fds) { /* * We're done closing the files we were supposed to. Time to install * the new file descriptor table and drop the old one. */ task_lock(me); me->files = cur_fds; task_unlock(me); put_files_struct(fds); } return 0; } /* * See close_fd_get_file() below, this variant assumes current->files->file_lock * is held. */ int __close_fd_get_file(unsigned int fd, struct file **res) { struct files_struct *files = current->files; struct file *file; struct fdtable *fdt; fdt = files_fdtable(files); if (fd >= fdt->max_fds) goto out_err; file = fdt->fd[fd]; if (!file) goto out_err; rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); get_file(file); *res = file; return 0; out_err: *res = NULL; return -ENOENT; } /* * variant of close_fd that gets a ref on the file for later fput. * The caller must ensure that filp_close() called on the file, and then * an fput(). */ int close_fd_get_file(unsigned int fd, struct file **res) { struct files_struct *files = current->files; int ret; spin_lock(&files->file_lock); ret = __close_fd_get_file(fd, res); spin_unlock(&files->file_lock); return ret; } void do_close_on_exec(struct files_struct *files) { unsigned i; struct fdtable *fdt; /* exec unshares first */ spin_lock(&files->file_lock); for (i = 0; ; i++) { unsigned long set; unsigned fd = i * BITS_PER_LONG; fdt = files_fdtable(files); if (fd >= fdt->max_fds) break; set = fdt->close_on_exec[i]; if (!set) continue; fdt->close_on_exec[i] = 0; for ( ; set ; fd++, set >>= 1) { struct file *file; if (!(set & 1)) continue; file = fdt->fd[fd]; if (!file) continue; rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); filp_close(file, files); cond_resched(); spin_lock(&files->file_lock); } } spin_unlock(&files->file_lock); } static inline struct file *__fget_files_rcu(struct files_struct *files, unsigned int fd, fmode_t mask, unsigned int refs) { for (;;) { struct file *file; struct fdtable *fdt = rcu_dereference_raw(files->fdt); struct file __rcu **fdentry; if (unlikely(fd >= fdt->max_fds)) return NULL; fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds); file = rcu_dereference_raw(*fdentry); if (unlikely(!file)) return NULL; if (unlikely(file->f_mode & mask)) return NULL; /* * Ok, we have a file pointer. However, because we do * this all locklessly under RCU, we may be racing with * that file being closed. * * Such a race can take two forms: * * (a) the file ref already went down to zero, * and get_file_rcu_many() fails. Just try * again: */ if (unlikely(!get_file_rcu_many(file, refs))) continue; /* * (b) the file table entry has changed under us. * Note that we don't need to re-check the 'fdt->fd' * pointer having changed, because it always goes * hand-in-hand with 'fdt'. * * If so, we need to put our refs and try again. */ if (unlikely(rcu_dereference_raw(files->fdt) != fdt) || unlikely(rcu_dereference_raw(*fdentry) != file)) { fput_many(file, refs); continue; } /* * Ok, we have a ref to the file, and checked that it * still exists. */ return file; } } static struct file *__fget_files(struct files_struct *files, unsigned int fd, fmode_t mask, unsigned int refs) { struct file *file; rcu_read_lock(); file = __fget_files_rcu(files, fd, mask, refs); rcu_read_unlock(); return file; } static inline struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs) { return __fget_files(current->files, fd, mask, refs); } struct file *fget_many(unsigned int fd, unsigned int refs) { return __fget(fd, FMODE_PATH, refs); } struct file *fget(unsigned int fd) { return __fget(fd, FMODE_PATH, 1); } EXPORT_SYMBOL(fget); struct file *fget_raw(unsigned int fd) { return __fget(fd, 0, 1); } EXPORT_SYMBOL(fget_raw); struct file *fget_task(struct task_struct *task, unsigned int fd) { struct file *file = NULL; task_lock(task); if (task->files) file = __fget_files(task->files, fd, 0, 1); task_unlock(task); return file; } /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. * * You can use this instead of fget if you satisfy all of the following * conditions: * 1) You must call fput_light before exiting the syscall and returning control * to userspace (i.e. you cannot remember the returned struct file * after * returning to userspace). * 2) You must not call filp_close on the returned struct file * in between * calls to fget_light and fput_light. * 3) You must not clone the current task in between the calls to fget_light * and fput_light. * * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. */ static unsigned long __fget_light(unsigned int fd, fmode_t mask) { struct files_struct *files = current->files; struct file *file; if (atomic_read(&files->count) == 1) { file = __fcheck_files(files, fd); if (!file || unlikely(file->f_mode & mask)) return 0; return (unsigned long)file; } else { file = __fget(fd, mask, 1); if (!file) return 0; return FDPUT_FPUT | (unsigned long)file; } } unsigned long __fdget(unsigned int fd) { return __fget_light(fd, FMODE_PATH); } EXPORT_SYMBOL(__fdget); unsigned long __fdget_raw(unsigned int fd) { return __fget_light(fd, 0); } /* * Try to avoid f_pos locking. We only need it if the * file is marked for FMODE_ATOMIC_POS, and it can be * accessed multiple ways. * * Always do it for directories, because pidfd_getfd() * can make a file accessible even if it otherwise would * not be, and for directories this is a correctness * issue, not a "POSIX requirement". */ static inline bool file_needs_f_pos_lock(struct file *file) { return (file->f_mode & FMODE_ATOMIC_POS) && (file_count(file) > 1 || S_ISDIR(file_inode(file)->i_mode)); } unsigned long __fdget_pos(unsigned int fd) { unsigned long v = __fdget(fd); struct file *file = (struct file *)(v & ~3); if (file && file_needs_f_pos_lock(file)) { v |= FDPUT_POS_UNLOCK; mutex_lock(&file->f_pos_lock); } return v; } void __f_unlock_pos(struct file *f) { mutex_unlock(&f->f_pos_lock); } /* * We only lock f_pos if we have threads or if the file might be * shared with another process. In both cases we'll have an elevated * file count (done either by fdget() or by fork()). */ void set_close_on_exec(unsigned int fd, int flag) { struct files_struct *files = current->files; struct fdtable *fdt; spin_lock(&files->file_lock); fdt = files_fdtable(files); if (flag) __set_close_on_exec(fd, fdt); else __clear_close_on_exec(fd, fdt); spin_unlock(&files->file_lock); } bool get_close_on_exec(unsigned int fd) { struct files_struct *files = current->files; struct fdtable *fdt; bool res; rcu_read_lock(); fdt = files_fdtable(files); res = close_on_exec(fd, fdt); rcu_read_unlock(); return res; } static int do_dup2(struct files_struct *files, struct file *file, unsigned fd, unsigned flags) __releases(&files->file_lock) { struct file *tofree; struct fdtable *fdt; /* * We need to detect attempts to do dup2() over allocated but still * not finished descriptor. NB: OpenBSD avoids that at the price of * extra work in their equivalent of fget() - they insert struct * file immediately after grabbing descriptor, mark it larval if * more work (e.g. actual opening) is needed and make sure that * fget() treats larval files as absent. Potentially interesting, * but while extra work in fget() is trivial, locking implications * and amount of surgery on open()-related paths in VFS are not. * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" * deadlocks in rather amusing ways, AFAICS. All of that is out of * scope of POSIX or SUS, since neither considers shared descriptor * tables and this condition does not arise without those. */ fdt = files_fdtable(files); tofree = fdt->fd[fd]; if (!tofree && fd_is_open(fd, fdt)) goto Ebusy; get_file(file); rcu_assign_pointer(fdt->fd[fd], file); __set_open_fd(fd, fdt); if (flags & O_CLOEXEC) __set_close_on_exec(fd, fdt); else __clear_close_on_exec(fd, fdt); spin_unlock(&files->file_lock); if (tofree) filp_close(tofree, files); return fd; Ebusy: spin_unlock(&files->file_lock); return -EBUSY; } int replace_fd(unsigned fd, struct file *file, unsigned flags) { int err; struct files_struct *files = current->files; if (!file) return __close_fd(files, fd); if (fd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, fd); if (unlikely(err < 0)) goto out_unlock; return do_dup2(files, file, fd, flags); out_unlock: spin_unlock(&files->file_lock); return err; } /** * __receive_fd() - Install received file into file descriptor table * * @fd: fd to install into (if negative, a new fd will be allocated) * @file: struct file that was received from another process * @ufd: __user pointer to write new fd number to * @o_flags: the O_* flags to apply to the new fd entry * * Installs a received file into the file descriptor table, with appropriate * checks and count updates. Optionally writes the fd number to userspace, if * @ufd is non-NULL. * * This helper handles its own reference counting of the incoming * struct file. * * Returns newly install fd or -ve on error. */ int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int o_flags) { int new_fd; int error; error = security_file_receive(file); if (error) return error; if (fd < 0) { new_fd = get_unused_fd_flags(o_flags); if (new_fd < 0) return new_fd; } else { new_fd = fd; } if (ufd) { error = put_user(new_fd, ufd); if (error) { if (fd < 0) put_unused_fd(new_fd); return error; } } if (fd < 0) { fd_install(new_fd, get_file(file)); } else { error = replace_fd(new_fd, file, o_flags); if (error) return error; } /* Bump the sock usage counts, if any. */ __receive_sock(file); return new_fd; } static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; struct file *file; struct files_struct *files = current->files; if ((flags & ~O_CLOEXEC) != 0) return -EINVAL; if (unlikely(oldfd == newfd)) return -EINVAL; if (newfd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, newfd); file = fcheck(oldfd); if (unlikely(!file)) goto Ebadf; if (unlikely(err < 0)) { if (err == -EMFILE) goto Ebadf; goto out_unlock; } return do_dup2(files, file, newfd, flags); Ebadf: err = -EBADF; out_unlock: spin_unlock(&files->file_lock); return err; } SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) { return ksys_dup3(oldfd, newfd, flags); } SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) { if (unlikely(newfd == oldfd)) { /* corner case */ struct files_struct *files = current->files; int retval = oldfd; rcu_read_lock(); if (!fcheck_files(files, oldfd)) retval = -EBADF; rcu_read_unlock(); return retval; } return ksys_dup3(oldfd, newfd, 0); } SYSCALL_DEFINE1(dup, unsigned int, fildes) { int ret = -EBADF; struct file *file = fget_raw(fildes); if (file) { ret = get_unused_fd_flags(0); if (ret >= 0) fd_install(ret, file); else fput(file); } return ret; } int f_dupfd(unsigned int from, struct file *file, unsigned flags) { int err; if (from >= rlimit(RLIMIT_NOFILE)) return -EINVAL; err = alloc_fd(from, flags); if (err >= 0) { get_file(file); fd_install(err, file); } return err; } int iterate_fd(struct files_struct *files, unsigned n, int (*f)(const void *, struct file *, unsigned), const void *p) { struct fdtable *fdt; int res = 0; if (!files) return 0; spin_lock(&files->file_lock); for (fdt = files_fdtable(files); n < fdt->max_fds; n++) { struct file *file; file = rcu_dereference_check_fdtable(files, fdt->fd[n]); if (!file) continue; res = f(p, file, n); if (res) break; } spin_unlock(&files->file_lock); return res; } EXPORT_SYMBOL(iterate_fd);
9 9 9 9 9 9 9 9 9 9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 // SPDX-License-Identifier: GPL-2.0 /* * preemptoff and irqoff tracepoints * * Copyright (C) Joel Fernandes (Google) <joel@joelfernandes.org> */ #include <linux/kallsyms.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ftrace.h> #include <linux/kprobes.h> #include "trace.h" #define CREATE_TRACE_POINTS #include <trace/events/preemptirq.h> #ifdef CONFIG_TRACE_IRQFLAGS /* Per-cpu variable to prevent redundant calls when IRQs already off */ static DEFINE_PER_CPU(int, tracing_irq_cpu); /* * Like trace_hardirqs_on() but without the lockdep invocation. This is * used in the low level entry code where the ordering vs. RCU is important * and lockdep uses a staged approach which splits the lockdep hardirq * tracking into a RCU on and a RCU off section. */ void trace_hardirqs_on_prepare(void) { if (this_cpu_read(tracing_irq_cpu)) { if (!in_nmi()) trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1); tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); this_cpu_write(tracing_irq_cpu, 0); } } EXPORT_SYMBOL(trace_hardirqs_on_prepare); NOKPROBE_SYMBOL(trace_hardirqs_on_prepare); void trace_hardirqs_on(void) { if (this_cpu_read(tracing_irq_cpu)) { if (!in_nmi()) trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); this_cpu_write(tracing_irq_cpu, 0); } lockdep_hardirqs_on_prepare(CALLER_ADDR0); lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on); NOKPROBE_SYMBOL(trace_hardirqs_on); /* * Like trace_hardirqs_off() but without the lockdep invocation. This is * used in the low level entry code where the ordering vs. RCU is important * and lockdep uses a staged approach which splits the lockdep hardirq * tracking into a RCU on and a RCU off section. */ void trace_hardirqs_off_finish(void) { if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); if (!in_nmi()) trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1); } } EXPORT_SYMBOL(trace_hardirqs_off_finish); NOKPROBE_SYMBOL(trace_hardirqs_off_finish); void trace_hardirqs_off(void) { lockdep_hardirqs_off(CALLER_ADDR0); if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); if (!in_nmi()) trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); } } EXPORT_SYMBOL(trace_hardirqs_off); NOKPROBE_SYMBOL(trace_hardirqs_off); __visible void trace_hardirqs_on_caller(unsigned long caller_addr) { if (this_cpu_read(tracing_irq_cpu)) { if (!in_nmi()) trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr); tracer_hardirqs_on(CALLER_ADDR0, caller_addr); this_cpu_write(tracing_irq_cpu, 0); } lockdep_hardirqs_on_prepare(caller_addr); lockdep_hardirqs_on(caller_addr); } EXPORT_SYMBOL(trace_hardirqs_on_caller); NOKPROBE_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { lockdep_hardirqs_off(caller_addr); if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, caller_addr); if (!in_nmi()) trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); } } EXPORT_SYMBOL(trace_hardirqs_off_caller); NOKPROBE_SYMBOL(trace_hardirqs_off_caller); #endif /* CONFIG_TRACE_IRQFLAGS */ #ifdef CONFIG_TRACE_PREEMPT_TOGGLE void trace_preempt_on(unsigned long a0, unsigned long a1) { if (!in_nmi()) trace_preempt_enable_rcuidle(a0, a1); tracer_preempt_on(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { if (!in_nmi()) trace_preempt_disable_rcuidle(a0, a1); tracer_preempt_off(a0, a1); } #endif
1 1 1 1 2 1 2 2 2 2 1 1 1 1 1 2 1 2 2 2 1 1 2 1 2 2 2 2 2 9 1 8 8 2 1 6 3 5 9 3 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 // SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/sys.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/export.h> #include <linux/mm.h> #include <linux/utsname.h> #include <linux/mman.h> #include <linux/reboot.h> #include <linux/prctl.h> #include <linux/highuid.h> #include <linux/fs.h> #include <linux/kmod.h> #include <linux/perf_event.h> #include <linux/resource.h> #include <linux/kernel.h> #include <linux/workqueue.h> #include <linux/capability.h> #include <linux/device.h> #include <linux/key.h> #include <linux/times.h> #include <linux/posix-timers.h> #include <linux/security.h> #include <linux/dcookies.h> #include <linux/suspend.h> #include <linux/tty.h> #include <linux/signal.h> #include <linux/cn_proc.h> #include <linux/getcpu.h> #include <linux/task_io_accounting_ops.h> #include <linux/seccomp.h> #include <linux/cpu.h> #include <linux/personality.h> #include <linux/ptrace.h> #include <linux/fs_struct.h> #include <linux/file.h> #include <linux/mount.h> #include <linux/gfp.h> #include <linux/syscore_ops.h> #include <linux/version.h> #include <linux/ctype.h> #include <linux/compat.h> #include <linux/syscalls.h> #include <linux/kprobes.h> #include <linux/user_namespace.h> #include <linux/time_namespace.h> #include <linux/binfmts.h> #include <linux/sched.h> #include <linux/sched/autogroup.h> #include <linux/sched/loadavg.h> #include <linux/sched/stat.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/task.h> #include <linux/sched/cputime.h> #include <linux/rcupdate.h> #include <linux/uidgid.h> #include <linux/cred.h> #include <linux/nospec.h> #include <linux/kmsg_dump.h> /* Move somewhere else to avoid recompiling? */ #include <generated/utsrelease.h> #include <linux/uaccess.h> #include <asm/io.h> #include <asm/unistd.h> #include "uid16.h" #ifndef SET_UNALIGN_CTL # define SET_UNALIGN_CTL(a, b) (-EINVAL) #endif #ifndef GET_UNALIGN_CTL # define GET_UNALIGN_CTL(a, b) (-EINVAL) #endif #ifndef SET_FPEMU_CTL # define SET_FPEMU_CTL(a, b) (-EINVAL) #endif #ifndef GET_FPEMU_CTL # define GET_FPEMU_CTL(a, b) (-EINVAL) #endif #ifndef SET_FPEXC_CTL # define SET_FPEXC_CTL(a, b) (-EINVAL) #endif #ifndef GET_FPEXC_CTL # define GET_FPEXC_CTL(a, b) (-EINVAL) #endif #ifndef GET_ENDIAN # define GET_ENDIAN(a, b) (-EINVAL) #endif #ifndef SET_ENDIAN # define SET_ENDIAN(a, b) (-EINVAL) #endif #ifndef GET_TSC_CTL # define GET_TSC_CTL(a) (-EINVAL) #endif #ifndef SET_TSC_CTL # define SET_TSC_CTL(a) (-EINVAL) #endif #ifndef GET_FP_MODE # define GET_FP_MODE(a) (-EINVAL) #endif #ifndef SET_FP_MODE # define SET_FP_MODE(a,b) (-EINVAL) #endif #ifndef SVE_SET_VL # define SVE_SET_VL(a) (-EINVAL) #endif #ifndef SVE_GET_VL # define SVE_GET_VL() (-EINVAL) #endif #ifndef PAC_RESET_KEYS # define PAC_RESET_KEYS(a, b) (-EINVAL) #endif #ifndef SET_TAGGED_ADDR_CTRL # define SET_TAGGED_ADDR_CTRL(a) (-EINVAL) #endif #ifndef GET_TAGGED_ADDR_CTRL # define GET_TAGGED_ADDR_CTRL() (-EINVAL) #endif /* * this is where the system-wide overflow UID and GID are defined, for * architectures that now have 32-bit UID/GID but didn't in the past */ int overflowuid = DEFAULT_OVERFLOWUID; int overflowgid = DEFAULT_OVERFLOWGID; EXPORT_SYMBOL(overflowuid); EXPORT_SYMBOL(overflowgid); /* * the same as above, but for filesystems which can only store a 16-bit * UID and GID. as such, this is needed on all architectures */ int fs_overflowuid = DEFAULT_FS_OVERFLOWUID; int fs_overflowgid = DEFAULT_FS_OVERFLOWGID; EXPORT_SYMBOL(fs_overflowuid); EXPORT_SYMBOL(fs_overflowgid); /* * Returns true if current's euid is same as p's uid or euid, * or has CAP_SYS_NICE to p's user_ns. * * Called with rcu_read_lock, creds are safe */ static bool set_one_prio_perm(struct task_struct *p) { const struct cred *cred = current_cred(), *pcred = __task_cred(p); if (uid_eq(pcred->uid, cred->euid) || uid_eq(pcred->euid, cred->euid)) return true; if (ns_capable(pcred->user_ns, CAP_SYS_NICE)) return true; return false; } /* * set the priority of a task * - the caller must hold the RCU read lock */ static int set_one_prio(struct task_struct *p, int niceval, int error) { int no_nice; if (!set_one_prio_perm(p)) { error = -EPERM; goto out; } if (niceval < task_nice(p) && !can_nice(p, niceval)) { error = -EACCES; goto out; } no_nice = security_task_setnice(p, niceval); if (no_nice) { error = no_nice; goto out; } if (error == -ESRCH) error = 0; set_user_nice(p, niceval); out: return error; } SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) { struct task_struct *g, *p; struct user_struct *user; const struct cred *cred = current_cred(); int error = -EINVAL; struct pid *pgrp; kuid_t uid; if (which > PRIO_USER || which < PRIO_PROCESS) goto out; /* normalize: avoid signed division (rounding problems) */ error = -ESRCH; if (niceval < MIN_NICE) niceval = MIN_NICE; if (niceval > MAX_NICE) niceval = MAX_NICE; rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: if (who) p = find_task_by_vpid(who); else p = current; if (p) error = set_one_prio(p, niceval, error); break; case PRIO_PGRP: if (who) pgrp = find_vpid(who); else pgrp = task_pgrp(current); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { error = set_one_prio(p, niceval, error); } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: uid = make_kuid(cred->user_ns, who); user = cred->user; if (!who) uid = cred->uid; else if (!uid_eq(uid, cred->uid)) { user = find_user(uid); if (!user) goto out_unlock; /* No processes for this user */ } do_each_thread(g, p) { if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) error = set_one_prio(p, niceval, error); } while_each_thread(g, p); if (!uid_eq(uid, cred->uid)) free_uid(user); /* For find_user() */ break; } out_unlock: read_unlock(&tasklist_lock); rcu_read_unlock(); out: return error; } /* * Ugh. To avoid negative return values, "getpriority()" will * not return the normal nice-value, but a negated value that * has been offset by 20 (ie it returns 40..1 instead of -20..19) * to stay compatible. */ SYSCALL_DEFINE2(getpriority, int, which, int, who) { struct task_struct *g, *p; struct user_struct *user; const struct cred *cred = current_cred(); long niceval, retval = -ESRCH; struct pid *pgrp; kuid_t uid; if (which > PRIO_USER || which < PRIO_PROCESS) return -EINVAL; rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: if (who) p = find_task_by_vpid(who); else p = current; if (p) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } break; case PRIO_PGRP: if (who) pgrp = find_vpid(who); else pgrp = task_pgrp(current); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: uid = make_kuid(cred->user_ns, who); user = cred->user; if (!who) uid = cred->uid; else if (!uid_eq(uid, cred->uid)) { user = find_user(uid); if (!user) goto out_unlock; /* No processes for this user */ } do_each_thread(g, p) { if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } } while_each_thread(g, p); if (!uid_eq(uid, cred->uid)) free_uid(user); /* for find_user() */ break; } out_unlock: read_unlock(&tasklist_lock); rcu_read_unlock(); return retval; } /* * Unprivileged users may change the real gid to the effective gid * or vice versa. (BSD-style) * * If you set the real gid at all, or set the effective gid to a value not * equal to the real gid, then the saved gid is set to the new effective gid. * * This makes it possible for a setgid program to completely drop its * privileges, which is often a useful assertion to make when you are doing * a security audit over a program. * * The general idea is that a program which uses just setregid() will be * 100% compatible with BSD. A program which uses just setgid() will be * 100% compatible with POSIX with saved IDs. * * SMP: There are not races, the GIDs are checked only by filesystem * operations (as far as semantic preservation is concerned). */ #ifdef CONFIG_MULTIUSER long __sys_setregid(gid_t rgid, gid_t egid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kgid_t krgid, kegid; krgid = make_kgid(ns, rgid); kegid = make_kgid(ns, egid); if ((rgid != (gid_t) -1) && !gid_valid(krgid)) return -EINVAL; if ((egid != (gid_t) -1) && !gid_valid(kegid)) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); retval = -EPERM; if (rgid != (gid_t) -1) { if (gid_eq(old->gid, krgid) || gid_eq(old->egid, krgid) || ns_capable_setid(old->user_ns, CAP_SETGID)) new->gid = krgid; else goto error; } if (egid != (gid_t) -1) { if (gid_eq(old->gid, kegid) || gid_eq(old->egid, kegid) || gid_eq(old->sgid, kegid) || ns_capable_setid(old->user_ns, CAP_SETGID)) new->egid = kegid; else goto error; } if (rgid != (gid_t) -1 || (egid != (gid_t) -1 && !gid_eq(kegid, old->gid))) new->sgid = new->egid; new->fsgid = new->egid; retval = security_task_fix_setgid(new, old, LSM_SETID_RE); if (retval < 0) goto error; return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) { return __sys_setregid(rgid, egid); } /* * setgid() is implemented like SysV w/ SAVED_IDS * * SMP: Same implicit races as above. */ long __sys_setgid(gid_t gid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kgid_t kgid; kgid = make_kgid(ns, gid); if (!gid_valid(kgid)) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); retval = -EPERM; if (ns_capable_setid(old->user_ns, CAP_SETGID)) new->gid = new->egid = new->sgid = new->fsgid = kgid; else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) new->egid = new->fsgid = kgid; else goto error; retval = security_task_fix_setgid(new, old, LSM_SETID_ID); if (retval < 0) goto error; return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE1(setgid, gid_t, gid) { return __sys_setgid(gid); } /* * change the user struct in a credentials set to match the new UID */ static int set_user(struct cred *new) { struct user_struct *new_user; new_user = alloc_uid(new->uid); if (!new_user) return -EAGAIN; /* * We don't fail in case of NPROC limit excess here because too many * poorly written programs don't check set*uid() return code, assuming * it never fails if called by root. We may still enforce NPROC limit * for programs doing set*uid()+execve() by harmlessly deferring the * failure to the execve() stage. */ if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && new_user != INIT_USER) current->flags |= PF_NPROC_EXCEEDED; else current->flags &= ~PF_NPROC_EXCEEDED; free_uid(new->user); new->user = new_user; return 0; } /* * Unprivileged users may change the real uid to the effective uid * or vice versa. (BSD-style) * * If you set the real uid at all, or set the effective uid to a value not * equal to the real uid, then the saved uid is set to the new effective uid. * * This makes it possible for a setuid program to completely drop its * privileges, which is often a useful assertion to make when you are doing * a security audit over a program. * * The general idea is that a program which uses just setreuid() will be * 100% compatible with BSD. A program which uses just setuid() will be * 100% compatible with POSIX with saved IDs. */ long __sys_setreuid(uid_t ruid, uid_t euid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kuid_t kruid, keuid; kruid = make_kuid(ns, ruid); keuid = make_kuid(ns, euid); if ((ruid != (uid_t) -1) && !uid_valid(kruid)) return -EINVAL; if ((euid != (uid_t) -1) && !uid_valid(keuid)) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); retval = -EPERM; if (ruid != (uid_t) -1) { new->uid = kruid; if (!uid_eq(old->uid, kruid) && !uid_eq(old->euid, kruid) && !ns_capable_setid(old->user_ns, CAP_SETUID)) goto error; } if (euid != (uid_t) -1) { new->euid = keuid; if (!uid_eq(old->uid, keuid) && !uid_eq(old->euid, keuid) && !uid_eq(old->suid, keuid) && !ns_capable_setid(old->user_ns, CAP_SETUID)) goto error; } if (!uid_eq(new->uid, old->uid)) { retval = set_user(new); if (retval < 0) goto error; } if (ruid != (uid_t) -1 || (euid != (uid_t) -1 && !uid_eq(keuid, old->uid))) new->suid = new->euid; new->fsuid = new->euid; retval = security_task_fix_setuid(new, old, LSM_SETID_RE); if (retval < 0) goto error; return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) { return __sys_setreuid(ruid, euid); } /* * setuid() is implemented like SysV with SAVED_IDS * * Note that SAVED_ID's is deficient in that a setuid root program * like sendmail, for example, cannot set its uid to be a normal * user and then switch back, because if you're root, setuid() sets * the saved uid too. If you don't like this, blame the bright people * in the POSIX committee and/or USG. Note that the BSD-style setreuid() * will allow a root program to temporarily drop privileges and be able to * regain them by swapping the real and effective uid. */ long __sys_setuid(uid_t uid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kuid_t kuid; kuid = make_kuid(ns, uid); if (!uid_valid(kuid)) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); retval = -EPERM; if (ns_capable_setid(old->user_ns, CAP_SETUID)) { new->suid = new->uid = kuid; if (!uid_eq(kuid, old->uid)) { retval = set_user(new); if (retval < 0) goto error; } } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) { goto error; } new->fsuid = new->euid = kuid; retval = security_task_fix_setuid(new, old, LSM_SETID_ID); if (retval < 0) goto error; return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE1(setuid, uid_t, uid) { return __sys_setuid(uid); } /* * This function implements a generic ability to update ruid, euid, * and suid. This allows you to implement the 4.4 compatible seteuid(). */ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kuid_t kruid, keuid, ksuid; bool ruid_new, euid_new, suid_new; kruid = make_kuid(ns, ruid); keuid = make_kuid(ns, euid); ksuid = make_kuid(ns, suid); if ((ruid != (uid_t) -1) && !uid_valid(kruid)) return -EINVAL; if ((euid != (uid_t) -1) && !uid_valid(keuid)) return -EINVAL; if ((suid != (uid_t) -1) && !uid_valid(ksuid)) return -EINVAL; old = current_cred(); /* check for no-op */ if ((ruid == (uid_t) -1 || uid_eq(kruid, old->uid)) && (euid == (uid_t) -1 || (uid_eq(keuid, old->euid) && uid_eq(keuid, old->fsuid))) && (suid == (uid_t) -1 || uid_eq(ksuid, old->suid))) return 0; ruid_new = ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid); euid_new = euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid); suid_new = suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid); if ((ruid_new || euid_new || suid_new) && !ns_capable_setid(old->user_ns, CAP_SETUID)) return -EPERM; new = prepare_creds(); if (!new) return -ENOMEM; if (ruid != (uid_t) -1) { new->uid = kruid; if (!uid_eq(kruid, old->uid)) { retval = set_user(new); if (retval < 0) goto error; } } if (euid != (uid_t) -1) new->euid = keuid; if (suid != (uid_t) -1) new->suid = ksuid; new->fsuid = new->euid; retval = security_task_fix_setuid(new, old, LSM_SETID_RES); if (retval < 0) goto error; return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) { return __sys_setresuid(ruid, euid, suid); } SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) { const struct cred *cred = current_cred(); int retval; uid_t ruid, euid, suid; ruid = from_kuid_munged(cred->user_ns, cred->uid); euid = from_kuid_munged(cred->user_ns, cred->euid); suid = from_kuid_munged(cred->user_ns, cred->suid); retval = put_user(ruid, ruidp); if (!retval) { retval = put_user(euid, euidp); if (!retval) return put_user(suid, suidp); } return retval; } /* * Same as above, but for rgid, egid, sgid. */ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kgid_t krgid, kegid, ksgid; bool rgid_new, egid_new, sgid_new; krgid = make_kgid(ns, rgid); kegid = make_kgid(ns, egid); ksgid = make_kgid(ns, sgid); if ((rgid != (gid_t) -1) && !gid_valid(krgid)) return -EINVAL; if ((egid != (gid_t) -1) && !gid_valid(kegid)) return -EINVAL; if ((sgid != (gid_t) -1) && !gid_valid(ksgid)) return -EINVAL; old = current_cred(); /* check for no-op */ if ((rgid == (gid_t) -1 || gid_eq(krgid, old->gid)) && (egid == (gid_t) -1 || (gid_eq(kegid, old->egid) && gid_eq(kegid, old->fsgid))) && (sgid == (gid_t) -1 || gid_eq(ksgid, old->sgid))) return 0; rgid_new = rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid); egid_new = egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid); sgid_new = sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid); if ((rgid_new || egid_new || sgid_new) && !ns_capable_setid(old->user_ns, CAP_SETGID)) return -EPERM; new = prepare_creds(); if (!new) return -ENOMEM; if (rgid != (gid_t) -1) new->gid = krgid; if (egid != (gid_t) -1) new->egid = kegid; if (sgid != (gid_t) -1) new->sgid = ksgid; new->fsgid = new->egid; retval = security_task_fix_setgid(new, old, LSM_SETID_RES); if (retval < 0) goto error; return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) { return __sys_setresgid(rgid, egid, sgid); } SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) { const struct cred *cred = current_cred(); int retval; gid_t rgid, egid, sgid; rgid = from_kgid_munged(cred->user_ns, cred->gid); egid = from_kgid_munged(cred->user_ns, cred->egid); sgid = from_kgid_munged(cred->user_ns, cred->sgid); retval = put_user(rgid, rgidp); if (!retval) { retval = put_user(egid, egidp); if (!retval) retval = put_user(sgid, sgidp); } return retval; } /* * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This * is used for "access()" and for the NFS daemon (letting nfsd stay at * whatever uid it wants to). It normally shadows "euid", except when * explicitly set by setfsuid() or for access.. */ long __sys_setfsuid(uid_t uid) { const struct cred *old; struct cred *new; uid_t old_fsuid; kuid_t kuid; old = current_cred(); old_fsuid = from_kuid_munged(old->user_ns, old->fsuid); kuid = make_kuid(old->user_ns, uid); if (!uid_valid(kuid)) return old_fsuid; new = prepare_creds(); if (!new) return old_fsuid; if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || ns_capable_setid(old->user_ns, CAP_SETUID)) { if (!uid_eq(kuid, old->fsuid)) { new->fsuid = kuid; if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) goto change_okay; } } abort_creds(new); return old_fsuid; change_okay: commit_creds(new); return old_fsuid; } SYSCALL_DEFINE1(setfsuid, uid_t, uid) { return __sys_setfsuid(uid); } /* * Samma på svenska.. */ long __sys_setfsgid(gid_t gid) { const struct cred *old; struct cred *new; gid_t old_fsgid; kgid_t kgid; old = current_cred(); old_fsgid = from_kgid_munged(old->user_ns, old->fsgid); kgid = make_kgid(old->user_ns, gid); if (!gid_valid(kgid)) return old_fsgid; new = prepare_creds(); if (!new) return old_fsgid; if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || ns_capable_setid(old->user_ns, CAP_SETGID)) { if (!gid_eq(kgid, old->fsgid)) { new->fsgid = kgid; if (security_task_fix_setgid(new,old,LSM_SETID_FS) == 0) goto change_okay; } } abort_creds(new); return old_fsgid; change_okay: commit_creds(new); return old_fsgid; } SYSCALL_DEFINE1(setfsgid, gid_t, gid) { return __sys_setfsgid(gid); } #endif /* CONFIG_MULTIUSER */ /** * sys_getpid - return the thread group id of the current process * * Note, despite the name, this returns the tgid not the pid. The tgid and * the pid are identical unless CLONE_THREAD was specified on clone() in * which case the tgid is the same in all threads of the same group. * * This is SMP safe as current->tgid does not change. */ SYSCALL_DEFINE0(getpid) { return task_tgid_vnr(current); } /* Thread ID - the internal kernel "pid" */ SYSCALL_DEFINE0(gettid) { return task_pid_vnr(current); } /* * Accessing ->real_parent is not SMP-safe, it could * change from under us. However, we can use a stale * value of ->real_parent under rcu_read_lock(), see * release_task()->call_rcu(delayed_put_task_struct). */ SYSCALL_DEFINE0(getppid) { int pid; rcu_read_lock(); pid = task_tgid_vnr(rcu_dereference(current->real_parent)); rcu_read_unlock(); return pid; } SYSCALL_DEFINE0(getuid) { /* Only we change this so SMP safe */ return from_kuid_munged(current_user_ns(), current_uid()); } SYSCALL_DEFINE0(geteuid) { /* Only we change this so SMP safe */ return from_kuid_munged(current_user_ns(), current_euid()); } SYSCALL_DEFINE0(getgid) { /* Only we change this so SMP safe */ return from_kgid_munged(current_user_ns(), current_gid()); } SYSCALL_DEFINE0(getegid) { /* Only we change this so SMP safe */ return from_kgid_munged(current_user_ns(), current_egid()); } static void do_sys_times(struct tms *tms) { u64 tgutime, tgstime, cutime, cstime; thread_group_cputime_adjusted(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; tms->tms_utime = nsec_to_clock_t(tgutime); tms->tms_stime = nsec_to_clock_t(tgstime); tms->tms_cutime = nsec_to_clock_t(cutime); tms->tms_cstime = nsec_to_clock_t(cstime); } SYSCALL_DEFINE1(times, struct tms __user *, tbuf) { if (tbuf) { struct tms tmp; do_sys_times(&tmp); if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) return -EFAULT; } force_successful_syscall_return(); return (long) jiffies_64_to_clock_t(get_jiffies_64()); } #ifdef CONFIG_COMPAT static compat_clock_t clock_t_to_compat_clock_t(clock_t x) { return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); } COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf) { if (tbuf) { struct tms tms; struct compat_tms tmp; do_sys_times(&tms); /* Convert our struct tms to the compat version. */ tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); if (copy_to_user(tbuf, &tmp, sizeof(tmp))) return -EFAULT; } force_successful_syscall_return(); return compat_jiffies_to_clock_t(jiffies); } #endif /* * This needs some heavy checking ... * I just haven't the stomach for it. I also don't fully * understand sessions/pgrp etc. Let somebody who does explain it. * * OK, I think I have the protection semantics right.... this is really * only important on a multi-user system anyway, to make sure one user * can't send a signal to a process owned by another. -TYT, 12/12/91 * * !PF_FORKNOEXEC check to conform completely to POSIX. */ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) { struct task_struct *p; struct task_struct *group_leader = current->group_leader; struct pid *pgrp; int err; if (!pid) pid = task_pid_vnr(group_leader); if (!pgid) pgid = pid; if (pgid < 0) return -EINVAL; rcu_read_lock(); /* From this point forward we keep holding onto the tasklist lock * so that our parent does not change from under us. -DaveM */ write_lock_irq(&tasklist_lock); err = -ESRCH; p = find_task_by_vpid(pid); if (!p) goto out; err = -EINVAL; if (!thread_group_leader(p)) goto out; if (same_thread_group(p->real_parent, group_leader)) { err = -EPERM; if (task_session(p) != task_session(group_leader)) goto out; err = -EACCES; if (!(p->flags & PF_FORKNOEXEC)) goto out; } else { err = -ESRCH; if (p != group_leader) goto out; } err = -EPERM; if (p->signal->leader) goto out; pgrp = task_pid(p); if (pgid != pid) { struct task_struct *g; pgrp = find_vpid(pgid); g = pid_task(pgrp, PIDTYPE_PGID); if (!g || task_session(g) != task_session(group_leader)) goto out; } err = security_task_setpgid(p, pgid); if (err) goto out; if (task_pgrp(p) != pgrp) change_pid(p, PIDTYPE_PGID, pgrp); err = 0; out: /* All paths lead to here, thus we are safe. -DaveM */ write_unlock_irq(&tasklist_lock); rcu_read_unlock(); return err; } static int do_getpgid(pid_t pid) { struct task_struct *p; struct pid *grp; int retval; rcu_read_lock(); if (!pid) grp = task_pgrp(current); else { retval = -ESRCH; p = find_task_by_vpid(pid); if (!p) goto out; grp = task_pgrp(p); if (!grp) goto out; retval = security_task_getpgid(p); if (retval) goto out; } retval = pid_vnr(grp); out: rcu_read_unlock(); return retval; } SYSCALL_DEFINE1(getpgid, pid_t, pid) { return do_getpgid(pid); } #ifdef __ARCH_WANT_SYS_GETPGRP SYSCALL_DEFINE0(getpgrp) { return do_getpgid(0); } #endif SYSCALL_DEFINE1(getsid, pid_t, pid) { struct task_struct *p; struct pid *sid; int retval; rcu_read_lock(); if (!pid) sid = task_session(current); else { retval = -ESRCH; p = find_task_by_vpid(pid); if (!p) goto out; sid = task_session(p); if (!sid) goto out; retval = security_task_getsid(p); if (retval) goto out; } retval = pid_vnr(sid); out: rcu_read_unlock(); return retval; } static void set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; if (task_session(curr) != pid) change_pid(curr, PIDTYPE_SID, pid); if (task_pgrp(curr) != pid) change_pid(curr, PIDTYPE_PGID, pid); } int ksys_setsid(void) { struct task_struct *group_leader = current->group_leader; struct pid *sid = task_pid(group_leader); pid_t session = pid_vnr(sid); int err = -EPERM; write_lock_irq(&tasklist_lock); /* Fail if I am already a session leader */ if (group_leader->signal->leader) goto out; /* Fail if a process group id already exists that equals the * proposed session id. */ if (pid_task(sid, PIDTYPE_PGID)) goto out; group_leader->signal->leader = 1; set_special_pids(sid); proc_clear_tty(group_leader); err = session; out: write_unlock_irq(&tasklist_lock); if (err > 0) { proc_sid_connector(group_leader); sched_autogroup_create_attach(group_leader); } return err; } SYSCALL_DEFINE0(setsid) { return ksys_setsid(); } DECLARE_RWSEM(uts_sem); #ifdef COMPAT_UTS_MACHINE #define override_architecture(name) \ (personality(current->personality) == PER_LINUX32 && \ copy_to_user(name->machine, COMPAT_UTS_MACHINE, \ sizeof(COMPAT_UTS_MACHINE))) #else #define override_architecture(name) 0 #endif /* * Work around broken programs that cannot handle "Linux 3.0". * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 * And we map 4.x and later versions to 2.6.60+x, so 4.0/5.0/6.0/... would be * 2.6.60. */ static int override_release(char __user *release, size_t len) { int ret = 0; if (current->personality & UNAME26) { const char *rest = UTS_RELEASE; char buf[65] = { 0 }; int ndots = 0; unsigned v; size_t copy; while (*rest) { if (*rest == '.' && ++ndots >= 3) break; if (!isdigit(*rest) && *rest != '.') break; rest++; } v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 60; copy = clamp_t(size_t, len, 1, sizeof(buf)); copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); ret = copy_to_user(release, buf, copy + 1); } return ret; } SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) { struct new_utsname tmp; down_read(&uts_sem); memcpy(&tmp, utsname(), sizeof(tmp)); up_read(&uts_sem); if (copy_to_user(name, &tmp, sizeof(tmp))) return -EFAULT; if (override_release(name->release, sizeof(name->release))) return -EFAULT; if (override_architecture(name)) return -EFAULT; return 0; } #ifdef __ARCH_WANT_SYS_OLD_UNAME /* * Old cruft */ SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) { struct old_utsname tmp; if (!name) return -EFAULT; down_read(&uts_sem); memcpy(&tmp, utsname(), sizeof(tmp)); up_read(&uts_sem); if (copy_to_user(name, &tmp, sizeof(tmp))) return -EFAULT; if (override_release(name->release, sizeof(name->release))) return -EFAULT; if (override_architecture(name)) return -EFAULT; return 0; } SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) { struct oldold_utsname tmp; if (!name) return -EFAULT; memset(&tmp, 0, sizeof(tmp)); down_read(&uts_sem); memcpy(&tmp.sysname, &utsname()->sysname, __OLD_UTS_LEN); memcpy(&tmp.nodename, &utsname()->nodename, __OLD_UTS_LEN); memcpy(&tmp.release, &utsname()->release, __OLD_UTS_LEN); memcpy(&tmp.version, &utsname()->version, __OLD_UTS_LEN); memcpy(&tmp.machine, &utsname()->machine, __OLD_UTS_LEN); up_read(&uts_sem); if (copy_to_user(name, &tmp, sizeof(tmp))) return -EFAULT; if (override_architecture(name)) return -EFAULT; if (override_release(name->release, sizeof(name->release))) return -EFAULT; return 0; } #endif SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) { int errno; char tmp[__NEW_UTS_LEN]; if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { struct new_utsname *u; down_write(&uts_sem); u = utsname(); memcpy(u->nodename, tmp, len); memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; uts_proc_notify(UTS_PROC_HOSTNAME); up_write(&uts_sem); } return errno; } #ifdef __ARCH_WANT_SYS_GETHOSTNAME SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) { int i; struct new_utsname *u; char tmp[__NEW_UTS_LEN + 1]; if (len < 0) return -EINVAL; down_read(&uts_sem); u = utsname(); i = 1 + strlen(u->nodename); if (i > len) i = len; memcpy(tmp, u->nodename, i); up_read(&uts_sem); if (copy_to_user(name, tmp, i)) return -EFAULT; return 0; } #endif /* * Only setdomainname; getdomainname can be implemented by calling * uname() */ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) { int errno; char tmp[__NEW_UTS_LEN]; if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { struct new_utsname *u; down_write(&uts_sem); u = utsname(); memcpy(u->domainname, tmp, len); memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; uts_proc_notify(UTS_PROC_DOMAINNAME); up_write(&uts_sem); } return errno; } SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) { struct rlimit value; int ret; ret = do_prlimit(current, resource, NULL, &value); if (!ret) ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; return ret; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct compat_rlimit __user *, rlim) { struct rlimit r; struct compat_rlimit r32; if (copy_from_user(&r32, rlim, sizeof(struct compat_rlimit))) return -EFAULT; if (r32.rlim_cur == COMPAT_RLIM_INFINITY) r.rlim_cur = RLIM_INFINITY; else r.rlim_cur = r32.rlim_cur; if (r32.rlim_max == COMPAT_RLIM_INFINITY) r.rlim_max = RLIM_INFINITY; else r.rlim_max = r32.rlim_max; return do_prlimit(current, resource, &r, NULL); } COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct compat_rlimit __user *, rlim) { struct rlimit r; int ret; ret = do_prlimit(current, resource, NULL, &r); if (!ret) { struct compat_rlimit r32; if (r.rlim_cur > COMPAT_RLIM_INFINITY) r32.rlim_cur = COMPAT_RLIM_INFINITY; else r32.rlim_cur = r.rlim_cur; if (r.rlim_max > COMPAT_RLIM_INFINITY) r32.rlim_max = COMPAT_RLIM_INFINITY; else r32.rlim_max = r.rlim_max; if (copy_to_user(rlim, &r32, sizeof(struct compat_rlimit))) return -EFAULT; } return ret; } #endif #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT /* * Back compatibility for getrlimit. Needed for some apps. */ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, struct rlimit __user *, rlim) { struct rlimit x; if (resource >= RLIM_NLIMITS) return -EINVAL; resource = array_index_nospec(resource, RLIM_NLIMITS); task_lock(current->group_leader); x = current->signal->rlim[resource]; task_unlock(current->group_leader); if (x.rlim_cur > 0x7FFFFFFF) x.rlim_cur = 0x7FFFFFFF; if (x.rlim_max > 0x7FFFFFFF) x.rlim_max = 0x7FFFFFFF; return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, struct compat_rlimit __user *, rlim) { struct rlimit r; if (resource >= RLIM_NLIMITS) return -EINVAL; resource = array_index_nospec(resource, RLIM_NLIMITS); task_lock(current->group_leader); r = current->signal->rlim[resource]; task_unlock(current->group_leader); if (r.rlim_cur > 0x7FFFFFFF) r.rlim_cur = 0x7FFFFFFF; if (r.rlim_max > 0x7FFFFFFF) r.rlim_max = 0x7FFFFFFF; if (put_user(r.rlim_cur, &rlim->rlim_cur) || put_user(r.rlim_max, &rlim->rlim_max)) return -EFAULT; return 0; } #endif #endif static inline bool rlim64_is_infinity(__u64 rlim64) { #if BITS_PER_LONG < 64 return rlim64 >= ULONG_MAX; #else return rlim64 == RLIM64_INFINITY; #endif } static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64) { if (rlim->rlim_cur == RLIM_INFINITY) rlim64->rlim_cur = RLIM64_INFINITY; else rlim64->rlim_cur = rlim->rlim_cur; if (rlim->rlim_max == RLIM_INFINITY) rlim64->rlim_max = RLIM64_INFINITY; else rlim64->rlim_max = rlim->rlim_max; } static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim) { if (rlim64_is_infinity(rlim64->rlim_cur)) rlim->rlim_cur = RLIM_INFINITY; else rlim->rlim_cur = (unsigned long)rlim64->rlim_cur; if (rlim64_is_infinity(rlim64->rlim_max)) rlim->rlim_max = RLIM_INFINITY; else rlim->rlim_max = (unsigned long)rlim64->rlim_max; } /* make sure you are allowed to change @tsk limits before calling this */ int do_prlimit(struct task_struct *tsk, unsigned int resource, struct rlimit *new_rlim, struct rlimit *old_rlim) { struct rlimit *rlim; int retval = 0; if (resource >= RLIM_NLIMITS) return -EINVAL; resource = array_index_nospec(resource, RLIM_NLIMITS); if (new_rlim) { if (new_rlim->rlim_cur > new_rlim->rlim_max) return -EINVAL; if (resource == RLIMIT_NOFILE && new_rlim->rlim_max > sysctl_nr_open) return -EPERM; } /* protect tsk->signal and tsk->sighand from disappearing */ read_lock(&tasklist_lock); if (!tsk->sighand) { retval = -ESRCH; goto out; } rlim = tsk->signal->rlim + resource; task_lock(tsk->group_leader); if (new_rlim) { /* Keep the capable check against init_user_ns until cgroups can contain all limits */ if (new_rlim->rlim_max > rlim->rlim_max && !capable(CAP_SYS_RESOURCE)) retval = -EPERM; if (!retval) retval = security_task_setrlimit(tsk, resource, new_rlim); } if (!retval) { if (old_rlim) *old_rlim = *rlim; if (new_rlim) *rlim = *new_rlim; } task_unlock(tsk->group_leader); /* * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not * infite. In case of RLIM_INFINITY the posix CPU timer code * ignores the rlimit. */ if (!retval && new_rlim && resource == RLIMIT_CPU && new_rlim->rlim_cur != RLIM_INFINITY && IS_ENABLED(CONFIG_POSIX_TIMERS)) update_rlimit_cpu(tsk, new_rlim->rlim_cur); out: read_unlock(&tasklist_lock); return retval; } /* rcu lock must be held */ static int check_prlimit_permission(struct task_struct *task, unsigned int flags) { const struct cred *cred = current_cred(), *tcred; bool id_match; if (current == task) return 0; tcred = __task_cred(task); id_match = (uid_eq(cred->uid, tcred->euid) && uid_eq(cred->uid, tcred->suid) && uid_eq(cred->uid, tcred->uid) && gid_eq(cred->gid, tcred->egid) && gid_eq(cred->gid, tcred->sgid) && gid_eq(cred->gid, tcred->gid)); if (!id_match && !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) return -EPERM; return security_task_prlimit(cred, tcred, flags); } SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, const struct rlimit64 __user *, new_rlim, struct rlimit64 __user *, old_rlim) { struct rlimit64 old64, new64; struct rlimit old, new; struct task_struct *tsk; unsigned int checkflags = 0; int ret; if (old_rlim) checkflags |= LSM_PRLIMIT_READ; if (new_rlim) { if (copy_from_user(&new64, new_rlim, sizeof(new64))) return -EFAULT; rlim64_to_rlim(&new64, &new); checkflags |= LSM_PRLIMIT_WRITE; } rcu_read_lock(); tsk = pid ? find_task_by_vpid(pid) : current; if (!tsk) { rcu_read_unlock(); return -ESRCH; } ret = check_prlimit_permission(tsk, checkflags); if (ret) { rcu_read_unlock(); return ret; } get_task_struct(tsk); rcu_read_unlock(); ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, old_rlim ? &old : NULL); if (!ret && old_rlim) { rlim_to_rlim64(&old, &old64); if (copy_to_user(old_rlim, &old64, sizeof(old64))) ret = -EFAULT; } put_task_struct(tsk); return ret; } SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) { struct rlimit new_rlim; if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) return -EFAULT; return do_prlimit(current, resource, &new_rlim, NULL); } /* * It would make sense to put struct rusage in the task_struct, * except that would make the task_struct be *really big*. After * task_struct gets moved into malloc'ed memory, it would * make sense to do this. It will make moving the rest of the information * a lot simpler! (Which we're not doing right now because we're not * measuring them yet). * * When sampling multiple threads for RUSAGE_SELF, under SMP we might have * races with threads incrementing their own counters. But since word * reads are atomic, we either get new values or old values and we don't * care which for the sums. We always take the siglock to protect reading * the c* fields from p->signal from races with exit.c updating those * fields when reaping, so a sample either gets all the additions of a * given child after it's reaped, or none so this sample is before reaping. * * Locking: * We need to take the siglock for CHILDEREN, SELF and BOTH * for the cases current multithreaded, non-current single threaded * non-current multithreaded. Thread traversal is now safe with * the siglock held. * Strictly speaking, we donot need to take the siglock if we are current and * single threaded, as no one else can take our signal_struct away, no one * else can reap the children to update signal->c* counters, and no one else * can race with the signal-> fields. If we do not take any lock, the * signal-> fields could be read out of order while another thread was just * exiting. So we should place a read memory barrier when we avoid the lock. * On the writer side, write memory barrier is implied in __exit_signal * as __exit_signal releases the siglock spinlock after updating the signal-> * fields. But we don't do this yet to keep things simple. * */ static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) { r->ru_nvcsw += t->nvcsw; r->ru_nivcsw += t->nivcsw; r->ru_minflt += t->min_flt; r->ru_majflt += t->maj_flt; r->ru_inblock += task_io_get_inblock(t); r->ru_oublock += task_io_get_oublock(t); } void getrusage(struct task_struct *p, int who, struct rusage *r) { struct task_struct *t; unsigned long flags; u64 tgutime, tgstime, utime, stime; unsigned long maxrss; struct mm_struct *mm; struct signal_struct *sig = p->signal; unsigned int seq = 0; retry: memset(r, 0, sizeof(*r)); utime = stime = 0; maxrss = 0; if (who == RUSAGE_THREAD) { task_cputime_adjusted(current, &utime, &stime); accumulate_thread_rusage(p, r); maxrss = sig->maxrss; goto out_thread; } flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); switch (who) { case RUSAGE_BOTH: case RUSAGE_CHILDREN: utime = sig->cutime; stime = sig->cstime; r->ru_nvcsw = sig->cnvcsw; r->ru_nivcsw = sig->cnivcsw; r->ru_minflt = sig->cmin_flt; r->ru_majflt = sig->cmaj_flt; r->ru_inblock = sig->cinblock; r->ru_oublock = sig->coublock; maxrss = sig->cmaxrss; if (who == RUSAGE_CHILDREN) break; fallthrough; case RUSAGE_SELF: r->ru_nvcsw += sig->nvcsw; r->ru_nivcsw += sig->nivcsw; r->ru_minflt += sig->min_flt; r->ru_majflt += sig->maj_flt; r->ru_inblock += sig->inblock; r->ru_oublock += sig->oublock; if (maxrss < sig->maxrss) maxrss = sig->maxrss; rcu_read_lock(); __for_each_thread(sig, t) accumulate_thread_rusage(t, r); rcu_read_unlock(); break; default: BUG(); } if (need_seqretry(&sig->stats_lock, seq)) { seq = 1; goto retry; } done_seqretry_irqrestore(&sig->stats_lock, seq, flags); if (who == RUSAGE_CHILDREN) goto out_children; thread_group_cputime_adjusted(p, &tgutime, &tgstime); utime += tgutime; stime += tgstime; out_thread: mm = get_task_mm(p); if (mm) { setmax_mm_hiwater_rss(&maxrss, mm); mmput(mm); } out_children: r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ r->ru_utime = ns_to_kernel_old_timeval(utime); r->ru_stime = ns_to_kernel_old_timeval(stime); } SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) { struct rusage r; if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && who != RUSAGE_THREAD) return -EINVAL; getrusage(current, who, &r); return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru) { struct rusage r; if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && who != RUSAGE_THREAD) return -EINVAL; getrusage(current, who, &r); return put_compat_rusage(&r, ru); } #endif SYSCALL_DEFINE1(umask, int, mask) { mask = xchg(&current->fs->umask, mask & S_IRWXUGO); return mask; } static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { struct fd exe; struct file *old_exe, *exe_file; struct inode *inode; int err; exe = fdget(fd); if (!exe.file) return -EBADF; inode = file_inode(exe.file); /* * Because the original mm->exe_file points to executable file, make * sure that this one is executable as well, to avoid breaking an * overall picture. */ err = -EACCES; if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path)) goto exit; err = inode_permission(inode, MAY_EXEC); if (err) goto exit; /* * Forbid mm->exe_file change if old file still mapped. */ exe_file = get_mm_exe_file(mm); err = -EBUSY; if (exe_file) { struct vm_area_struct *vma; mmap_read_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) { if (!vma->vm_file) continue; if (path_equal(&vma->vm_file->f_path, &exe_file->f_path)) goto exit_err; } mmap_read_unlock(mm); fput(exe_file); } err = 0; /* set the new file, lockless */ get_file(exe.file); old_exe = xchg(&mm->exe_file, exe.file); if (old_exe) fput(old_exe); exit: fdput(exe); return err; exit_err: mmap_read_unlock(mm); fput(exe_file); goto exit; } /* * Check arithmetic relations of passed addresses. * * WARNING: we don't require any capability here so be very careful * in what is allowed for modification from userspace. */ static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map) { unsigned long mmap_max_addr = TASK_SIZE; int error = -EINVAL, i; static const unsigned char offsets[] = { offsetof(struct prctl_mm_map, start_code), offsetof(struct prctl_mm_map, end_code), offsetof(struct prctl_mm_map, start_data), offsetof(struct prctl_mm_map, end_data), offsetof(struct prctl_mm_map, start_brk), offsetof(struct prctl_mm_map, brk), offsetof(struct prctl_mm_map, start_stack), offsetof(struct prctl_mm_map, arg_start), offsetof(struct prctl_mm_map, arg_end), offsetof(struct prctl_mm_map, env_start), offsetof(struct prctl_mm_map, env_end), }; /* * Make sure the members are not somewhere outside * of allowed address space. */ for (i = 0; i < ARRAY_SIZE(offsets); i++) { u64 val = *(u64 *)((char *)prctl_map + offsets[i]); if ((unsigned long)val >= mmap_max_addr || (unsigned long)val < mmap_min_addr) goto out; } /* * Make sure the pairs are ordered. */ #define __prctl_check_order(__m1, __op, __m2) \ ((unsigned long)prctl_map->__m1 __op \ (unsigned long)prctl_map->__m2) ? 0 : -EINVAL error = __prctl_check_order(start_code, <, end_code); error |= __prctl_check_order(start_data,<=, end_data); error |= __prctl_check_order(start_brk, <=, brk); error |= __prctl_check_order(arg_start, <=, arg_end); error |= __prctl_check_order(env_start, <=, env_end); if (error) goto out; #undef __prctl_check_order error = -EINVAL; /* * Neither we should allow to override limits if they set. */ if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk, prctl_map->start_brk, prctl_map->end_data, prctl_map->start_data)) goto out; error = 0; out: return error; } #ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size) { struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, }; unsigned long user_auxv[AT_VECTOR_SIZE]; struct mm_struct *mm = current->mm; int error; BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256); if (opt == PR_SET_MM_MAP_SIZE) return put_user((unsigned int)sizeof(prctl_map), (unsigned int __user *)addr); if (data_size != sizeof(prctl_map)) return -EINVAL; if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) return -EFAULT; error = validate_prctl_map_addr(&prctl_map); if (error) return error; if (prctl_map.auxv_size) { /* * Someone is trying to cheat the auxv vector. */ if (!prctl_map.auxv || prctl_map.auxv_size > sizeof(mm->saved_auxv)) return -EINVAL; memset(user_auxv, 0, sizeof(user_auxv)); if (copy_from_user(user_auxv, (const void __user *)prctl_map.auxv, prctl_map.auxv_size)) return -EFAULT; /* Last entry must be AT_NULL as specification requires */ user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL; user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; } if (prctl_map.exe_fd != (u32)-1) { /* * Check if the current user is checkpoint/restore capable. * At the time of this writing, it checks for CAP_SYS_ADMIN * or CAP_CHECKPOINT_RESTORE. * Note that a user with access to ptrace can masquerade an * arbitrary program as any executable, even setuid ones. * This may have implications in the tomoyo subsystem. */ if (!checkpoint_restore_ns_capable(current_user_ns())) return -EPERM; error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); if (error) return error; } /* * arg_lock protects concurent updates but we still need mmap_lock for * read to exclude races with sys_brk. */ mmap_read_lock(mm); /* * We don't validate if these members are pointing to * real present VMAs because application may have correspond * VMAs already unmapped and kernel uses these members for statistics * output in procfs mostly, except * * - @start_brk/@brk which are used in do_brk_flags but kernel lookups * for VMAs when updating these memvers so anything wrong written * here cause kernel to swear at userspace program but won't lead * to any problem in kernel itself */ spin_lock(&mm->arg_lock); mm->start_code = prctl_map.start_code; mm->end_code = prctl_map.end_code; mm->start_data = prctl_map.start_data; mm->end_data = prctl_map.end_data; mm->start_brk = prctl_map.start_brk; mm->brk = prctl_map.brk; mm->start_stack = prctl_map.start_stack; mm->arg_start = prctl_map.arg_start; mm->arg_end = prctl_map.arg_end; mm->env_start = prctl_map.env_start; mm->env_end = prctl_map.env_end; spin_unlock(&mm->arg_lock); /* * Note this update of @saved_auxv is lockless thus * if someone reads this member in procfs while we're * updating -- it may get partly updated results. It's * known and acceptable trade off: we leave it as is to * not introduce additional locks here making the kernel * more complex. */ if (prctl_map.auxv_size) memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); mmap_read_unlock(mm); return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr, unsigned long len) { /* * This doesn't move the auxiliary vector itself since it's pinned to * mm_struct, but it permits filling the vector with new values. It's * up to the caller to provide sane values here, otherwise userspace * tools which use this vector might be unhappy. */ unsigned long user_auxv[AT_VECTOR_SIZE]; if (len > sizeof(user_auxv)) return -EINVAL; if (copy_from_user(user_auxv, (const void __user *)addr, len)) return -EFAULT; /* Make sure the last entry is always AT_NULL */ user_auxv[AT_VECTOR_SIZE - 2] = 0; user_auxv[AT_VECTOR_SIZE - 1] = 0; BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); task_lock(current); memcpy(mm->saved_auxv, user_auxv, len); task_unlock(current); return 0; } static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { struct mm_struct *mm = current->mm; struct prctl_mm_map prctl_map = { .auxv = NULL, .auxv_size = 0, .exe_fd = -1, }; struct vm_area_struct *vma; int error; if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV && opt != PR_SET_MM_MAP && opt != PR_SET_MM_MAP_SIZE))) return -EINVAL; #ifdef CONFIG_CHECKPOINT_RESTORE if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE) return prctl_set_mm_map(opt, (const void __user *)addr, arg4); #endif if (!capable(CAP_SYS_RESOURCE)) return -EPERM; if (opt == PR_SET_MM_EXE_FILE) return prctl_set_mm_exe_file(mm, (unsigned int)addr); if (opt == PR_SET_MM_AUXV) return prctl_set_auxv(mm, addr, arg4); if (addr >= TASK_SIZE || addr < mmap_min_addr) return -EINVAL; error = -EINVAL; /* * arg_lock protects concurent updates of arg boundaries, we need * mmap_lock for a) concurrent sys_brk, b) finding VMA for addr * validation. */ mmap_read_lock(mm); vma = find_vma(mm, addr); spin_lock(&mm->arg_lock); prctl_map.start_code = mm->start_code; prctl_map.end_code = mm->end_code; prctl_map.start_data = mm->start_data; prctl_map.end_data = mm->end_data; prctl_map.start_brk = mm->start_brk; prctl_map.brk = mm->brk; prctl_map.start_stack = mm->start_stack; prctl_map.arg_start = mm->arg_start; prctl_map.arg_end = mm->arg_end; prctl_map.env_start = mm->env_start; prctl_map.env_end = mm->env_end; switch (opt) { case PR_SET_MM_START_CODE: prctl_map.start_code = addr; break; case PR_SET_MM_END_CODE: prctl_map.end_code = addr; break; case PR_SET_MM_START_DATA: prctl_map.start_data = addr; break; case PR_SET_MM_END_DATA: prctl_map.end_data = addr; break; case PR_SET_MM_START_STACK: prctl_map.start_stack = addr; break; case PR_SET_MM_START_BRK: prctl_map.start_brk = addr; break; case PR_SET_MM_BRK: prctl_map.brk = addr; break; case PR_SET_MM_ARG_START: prctl_map.arg_start = addr; break; case PR_SET_MM_ARG_END: prctl_map.arg_end = addr; break; case PR_SET_MM_ENV_START: prctl_map.env_start = addr; break; case PR_SET_MM_ENV_END: prctl_map.env_end = addr; break; default: goto out; } error = validate_prctl_map_addr(&prctl_map); if (error) goto out; switch (opt) { /* * If command line arguments and environment * are placed somewhere else on stack, we can * set them up here, ARG_START/END to setup * command line argumets and ENV_START/END * for environment. */ case PR_SET_MM_START_STACK: case PR_SET_MM_ARG_START: case PR_SET_MM_ARG_END: case PR_SET_MM_ENV_START: case PR_SET_MM_ENV_END: if (!vma) { error = -EFAULT; goto out; } } mm->start_code = prctl_map.start_code; mm->end_code = prctl_map.end_code; mm->start_data = prctl_map.start_data; mm->end_data = prctl_map.end_data; mm->start_brk = prctl_map.start_brk; mm->brk = prctl_map.brk; mm->start_stack = prctl_map.start_stack; mm->arg_start = prctl_map.arg_start; mm->arg_end = prctl_map.arg_end; mm->env_start = prctl_map.env_start; mm->env_end = prctl_map.env_end; error = 0; out: spin_unlock(&mm->arg_lock); mmap_read_unlock(mm); return error; } #ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr) { return put_user(me->clear_child_tid, tid_addr); } #else static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr) { return -EINVAL; } #endif static int propagate_has_child_subreaper(struct task_struct *p, void *data) { /* * If task has has_child_subreaper - all its decendants * already have these flag too and new decendants will * inherit it on fork, skip them. * * If we've found child_reaper - skip descendants in * it's subtree as they will never get out pidns. */ if (p->signal->has_child_subreaper || is_child_reaper(task_pid(p))) return 0; p->signal->has_child_subreaper = 1; return 1; } int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which) { return -EINVAL; } int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, unsigned long ctrl) { return -EINVAL; } #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { struct task_struct *me = current; unsigned char comm[sizeof(me->comm)]; long error; error = security_task_prctl(option, arg2, arg3, arg4, arg5); if (error != -ENOSYS) return error; error = 0; switch (option) { case PR_SET_PDEATHSIG: if (!valid_signal(arg2)) { error = -EINVAL; break; } me->pdeath_signal = arg2; break; case PR_GET_PDEATHSIG: error = put_user(me->pdeath_signal, (int __user *)arg2); break; case PR_GET_DUMPABLE: error = get_dumpable(me->mm); break; case PR_SET_DUMPABLE: if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) { error = -EINVAL; break; } set_dumpable(me->mm, arg2); break; case PR_SET_UNALIGN: error = SET_UNALIGN_CTL(me, arg2); break; case PR_GET_UNALIGN: error = GET_UNALIGN_CTL(me, arg2); break; case PR_SET_FPEMU: error = SET_FPEMU_CTL(me, arg2); break; case PR_GET_FPEMU: error = GET_FPEMU_CTL(me, arg2); break; case PR_SET_FPEXC: error = SET_FPEXC_CTL(me, arg2); break; case PR_GET_FPEXC: error = GET_FPEXC_CTL(me, arg2); break; case PR_GET_TIMING: error = PR_TIMING_STATISTICAL; break; case PR_SET_TIMING: if (arg2 != PR_TIMING_STATISTICAL) error = -EINVAL; break; case PR_SET_NAME: comm[sizeof(me->comm) - 1] = 0; if (strncpy_from_user(comm, (char __user *)arg2, sizeof(me->comm) - 1) < 0) return -EFAULT; set_task_comm(me, comm); proc_comm_connector(me); break; case PR_GET_NAME: get_task_comm(comm, me); if (copy_to_user((char __user *)arg2, comm, sizeof(comm))) return -EFAULT; break; case PR_GET_ENDIAN: error = GET_ENDIAN(me, arg2); break; case PR_SET_ENDIAN: error = SET_ENDIAN(me, arg2); break; case PR_GET_SECCOMP: error = prctl_get_seccomp(); break; case PR_SET_SECCOMP: error = prctl_set_seccomp(arg2, (char __user *)arg3); break; case PR_GET_TSC: error = GET_TSC_CTL(arg2); break; case PR_SET_TSC: error = SET_TSC_CTL(arg2); break; case PR_TASK_PERF_EVENTS_DISABLE: error = perf_event_task_disable(); break; case PR_TASK_PERF_EVENTS_ENABLE: error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: if (current->timer_slack_ns > ULONG_MAX) error = ULONG_MAX; else error = current->timer_slack_ns; break; case PR_SET_TIMERSLACK: if (arg2 <= 0) current->timer_slack_ns = current->default_timer_slack_ns; else current->timer_slack_ns = arg2; break; case PR_MCE_KILL: if (arg4 | arg5) return -EINVAL; switch (arg2) { case PR_MCE_KILL_CLEAR: if (arg3 != 0) return -EINVAL; current->flags &= ~PF_MCE_PROCESS; break; case PR_MCE_KILL_SET: current->flags |= PF_MCE_PROCESS; if (arg3 == PR_MCE_KILL_EARLY) current->flags |= PF_MCE_EARLY; else if (arg3 == PR_MCE_KILL_LATE) current->flags &= ~PF_MCE_EARLY; else if (arg3 == PR_MCE_KILL_DEFAULT) current->flags &= ~(PF_MCE_EARLY|PF_MCE_PROCESS); else return -EINVAL; break; default: return -EINVAL; } break; case PR_MCE_KILL_GET: if (arg2 | arg3 | arg4 | arg5) return -EINVAL; if (current->flags & PF_MCE_PROCESS) error = (current->flags & PF_MCE_EARLY) ? PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; else error = PR_MCE_KILL_DEFAULT; break; case PR_SET_MM: error = prctl_set_mm(arg2, arg3, arg4, arg5); break; case PR_GET_TID_ADDRESS: error = prctl_get_tid_address(me, (int __user * __user *)arg2); break; case PR_SET_CHILD_SUBREAPER: me->signal->is_child_subreaper = !!arg2; if (!arg2) break; walk_process_tree(me, propagate_has_child_subreaper, NULL); break; case PR_GET_CHILD_SUBREAPER: error = put_user(me->signal->is_child_subreaper, (int __user *)arg2); break; case PR_SET_NO_NEW_PRIVS: if (arg2 != 1 || arg3 || arg4 || arg5) return -EINVAL; task_set_no_new_privs(current); break; case PR_GET_NO_NEW_PRIVS: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; return task_no_new_privs(current) ? 1 : 0; case PR_GET_THP_DISABLE: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags); break; case PR_SET_THP_DISABLE: if (arg3 || arg4 || arg5) return -EINVAL; if (mmap_write_lock_killable(me->mm)) return -EINTR; if (arg2) set_bit(MMF_DISABLE_THP, &me->mm->flags); else clear_bit(MMF_DISABLE_THP, &me->mm->flags); mmap_write_unlock(me->mm); break; case PR_MPX_ENABLE_MANAGEMENT: case PR_MPX_DISABLE_MANAGEMENT: /* No longer implemented: */ return -EINVAL; case PR_SET_FP_MODE: error = SET_FP_MODE(me, arg2); break; case PR_GET_FP_MODE: error = GET_FP_MODE(me); break; case PR_SVE_SET_VL: error = SVE_SET_VL(arg2); break; case PR_SVE_GET_VL: error = SVE_GET_VL(); break; case PR_GET_SPECULATION_CTRL: if (arg3 || arg4 || arg5) return -EINVAL; error = arch_prctl_spec_ctrl_get(me, arg2); break; case PR_SET_SPECULATION_CTRL: if (arg4 || arg5) return -EINVAL; error = arch_prctl_spec_ctrl_set(me, arg2, arg3); break; case PR_PAC_RESET_KEYS: if (arg3 || arg4 || arg5) return -EINVAL; error = PAC_RESET_KEYS(me, arg2); break; case PR_SET_TAGGED_ADDR_CTRL: if (arg3 || arg4 || arg5) return -EINVAL; error = SET_TAGGED_ADDR_CTRL(arg2); break; case PR_GET_TAGGED_ADDR_CTRL: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; error = GET_TAGGED_ADDR_CTRL(); break; case PR_SET_IO_FLUSHER: if (!capable(CAP_SYS_RESOURCE)) return -EPERM; if (arg3 || arg4 || arg5) return -EINVAL; if (arg2 == 1) current->flags |= PR_IO_FLUSHER; else if (!arg2) current->flags &= ~PR_IO_FLUSHER; else return -EINVAL; break; case PR_GET_IO_FLUSHER: if (!capable(CAP_SYS_RESOURCE)) return -EPERM; if (arg2 || arg3 || arg4 || arg5) return -EINVAL; error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER; break; default: error = -EINVAL; break; } return error; } SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, struct getcpu_cache __user *, unused) { int err = 0; int cpu = raw_smp_processor_id(); if (cpup) err |= put_user(cpu, cpup); if (nodep) err |= put_user(cpu_to_node(cpu), nodep); return err ? -EFAULT : 0; } /** * do_sysinfo - fill in sysinfo struct * @info: pointer to buffer to fill */ static int do_sysinfo(struct sysinfo *info) { unsigned long mem_total, sav_total; unsigned int mem_unit, bitcount; struct timespec64 tp; memset(info, 0, sizeof(struct sysinfo)); ktime_get_boottime_ts64(&tp); timens_add_boottime(&tp); info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); info->procs = nr_threads; si_meminfo(info); si_swapinfo(info); /* * If the sum of all the available memory (i.e. ram + swap) * is less than can be stored in a 32 bit unsigned long then * we can be binary compatible with 2.2.x kernels. If not, * well, in that case 2.2.x was broken anyways... * * -Erik Andersen <andersee@debian.org> */ mem_total = info->totalram + info->totalswap; if (mem_total < info->totalram || mem_total < info->totalswap) goto out; bitcount = 0; mem_unit = info->mem_unit; while (mem_unit > 1) { bitcount++; mem_unit >>= 1; sav_total = mem_total; mem_total <<= 1; if (mem_total < sav_total) goto out; } /* * If mem_total did not overflow, multiply all memory values by * info->mem_unit and set it to 1. This leaves things compatible * with 2.2.x, and also retains compatibility with earlier 2.4.x * kernels... */ info->mem_unit = 1; info->totalram <<= bitcount; info->freeram <<= bitcount; info->sharedram <<= bitcount; info->bufferram <<= bitcount; info->totalswap <<= bitcount; info->freeswap <<= bitcount; info->totalhigh <<= bitcount; info->freehigh <<= bitcount; out: return 0; } SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info) { struct sysinfo val; do_sysinfo(&val); if (copy_to_user(info, &val, sizeof(struct sysinfo))) return -EFAULT; return 0; } #ifdef CONFIG_COMPAT struct compat_sysinfo { s32 uptime; u32 loads[3]; u32 totalram; u32 freeram; u32 sharedram; u32 bufferram; u32 totalswap; u32 freeswap; u16 procs; u16 pad; u32 totalhigh; u32 freehigh; u32 mem_unit; char _f[20-2*sizeof(u32)-sizeof(int)]; }; COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) { struct sysinfo s; struct compat_sysinfo s_32; do_sysinfo(&s); /* Check to see if any memory value is too large for 32-bit and scale * down if needed */ if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) { int bitcount = 0; while (s.mem_unit < PAGE_SIZE) { s.mem_unit <<= 1; bitcount++; } s.totalram >>= bitcount; s.freeram >>= bitcount; s.sharedram >>= bitcount; s.bufferram >>= bitcount; s.totalswap >>= bitcount; s.freeswap >>= bitcount; s.totalhigh >>= bitcount; s.freehigh >>= bitcount; } memset(&s_32, 0, sizeof(s_32)); s_32.uptime = s.uptime; s_32.loads[0] = s.loads[0]; s_32.loads[1] = s.loads[1]; s_32.loads[2] = s.loads[2]; s_32.totalram = s.totalram; s_32.freeram = s.freeram; s_32.sharedram = s.sharedram; s_32.bufferram = s.bufferram; s_32.totalswap = s.totalswap; s_32.freeswap = s.freeswap; s_32.procs = s.procs; s_32.totalhigh = s.totalhigh; s_32.freehigh = s.freehigh; s_32.mem_unit = s.mem_unit; if (copy_to_user(info, &s_32, sizeof(s_32))) return -EFAULT; return 0; } #endif /* CONFIG_COMPAT */
8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 /* SPDX-License-Identifier: GPL-2.0 */ /* * Variant of atomic_t specialized for reference counts. * * The interface matches the atomic_t interface (to aid in porting) but only * provides the few functions one should use for reference counting. * * Saturation semantics * ==================== * * refcount_t differs from atomic_t in that the counter saturates at * REFCOUNT_SATURATED and will not move once there. This avoids wrapping the * counter and causing 'spurious' use-after-free issues. In order to avoid the * cost associated with introducing cmpxchg() loops into all of the saturating * operations, we temporarily allow the counter to take on an unchecked value * and then explicitly set it to REFCOUNT_SATURATED on detecting that underflow * or overflow has occurred. Although this is racy when multiple threads * access the refcount concurrently, by placing REFCOUNT_SATURATED roughly * equidistant from 0 and INT_MAX we minimise the scope for error: * * INT_MAX REFCOUNT_SATURATED UINT_MAX * 0 (0x7fff_ffff) (0xc000_0000) (0xffff_ffff) * +--------------------------------+----------------+----------------+ * <---------- bad value! ----------> * * (in a signed view of the world, the "bad value" range corresponds to * a negative counter value). * * As an example, consider a refcount_inc() operation that causes the counter * to overflow: * * int old = atomic_fetch_add_relaxed(r); * // old is INT_MAX, refcount now INT_MIN (0x8000_0000) * if (old < 0) * atomic_set(r, REFCOUNT_SATURATED); * * If another thread also performs a refcount_inc() operation between the two * atomic operations, then the count will continue to edge closer to 0. If it * reaches a value of 1 before /any/ of the threads reset it to the saturated * value, then a concurrent refcount_dec_and_test() may erroneously free the * underlying object. * Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently * 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK). * With the current PID limit, if no batched refcounting operations are used and * the attacker can't repeatedly trigger kernel oopses in the middle of refcount * operations, this makes it impossible for a saturated refcount to leave the * saturation range, even if it is possible for multiple uses of the same * refcount to nest in the context of a single task: * * (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT = * 0x40000000 / 0x400000 = 0x100 = 256 * * If hundreds of references are added/removed with a single refcounting * operation, it may potentially be possible to leave the saturation range; but * given the precise timing details involved with the round-robin scheduling of * each thread manipulating the refcount and the need to hit the race multiple * times in succession, there doesn't appear to be a practical avenue of attack * even if using refcount_add() operations with larger increments. * * Memory ordering * =============== * * Memory ordering rules are slightly relaxed wrt regular atomic_t functions * and provide only what is strictly required for refcounts. * * The increments are fully relaxed; these will not provide ordering. The * rationale is that whatever is used to obtain the object we're increasing the * reference count on will provide the ordering. For locked data structures, * its the lock acquire, for RCU/lockless data structures its the dependent * load. * * Do note that inc_not_zero() provides a control dependency which will order * future stores against the inc, this ensures we'll never modify the object * if we did not in fact acquire a reference. * * The decrements will provide release order, such that all the prior loads and * stores will be issued before, it also provides a control dependency, which * will order us against the subsequent free(). * * The control dependency is against the load of the cmpxchg (ll/sc) that * succeeded. This means the stores aren't fully ordered, but this is fine * because the 1->0 transition indicates no concurrency. * * Note that the allocator is responsible for ordering things between free() * and alloc(). * * The decrements dec_and_test() and sub_and_test() also provide acquire * ordering on success. * */ #ifndef _LINUX_REFCOUNT_H #define _LINUX_REFCOUNT_H #include <linux/atomic.h> #include <linux/bug.h> #include <linux/compiler.h> #include <linux/limits.h> #include <linux/spinlock_types.h> struct mutex; /** * struct refcount_t - variant of atomic_t specialized for reference counts * @refs: atomic_t counter field * * The counter saturates at REFCOUNT_SATURATED and will not move once * there. This avoids wrapping the counter and causing 'spurious' * use-after-free bugs. */ typedef struct refcount_struct { atomic_t refs; } refcount_t; #define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), } #define REFCOUNT_MAX INT_MAX #define REFCOUNT_SATURATED (INT_MIN / 2) enum refcount_saturation_type { REFCOUNT_ADD_NOT_ZERO_OVF, REFCOUNT_ADD_OVF, REFCOUNT_ADD_UAF, REFCOUNT_SUB_UAF, REFCOUNT_DEC_LEAK, }; void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t); /** * refcount_set - set a refcount's value * @r: the refcount * @n: value to which the refcount will be set */ static inline void refcount_set(refcount_t *r, int n) { atomic_set(&r->refs, n); } /** * refcount_read - get a refcount's value * @r: the refcount * * Return: the refcount's value */ static inline unsigned int refcount_read(const refcount_t *r) { return atomic_read(&r->refs); } static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp) { int old = refcount_read(r); do { if (!old) break; } while (!atomic_try_cmpxchg_relaxed(&r->refs, &old, old + i)); if (oldp) *oldp = old; if (unlikely(old < 0 || old + i < 0)) refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF); return old; } /** * refcount_add_not_zero - add a value to a refcount unless it is 0 * @i: the value to add to the refcount * @r: the refcount * * Will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See the comment on top. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_inc(), or one of its variants, should instead be used to * increment a reference count. * * Return: false if the passed refcount is 0, true otherwise */ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) { return __refcount_add_not_zero(i, r, NULL); } static inline void __refcount_add(int i, refcount_t *r, int *oldp) { int old = atomic_fetch_add_relaxed(i, &r->refs); if (oldp) *oldp = old; if (unlikely(!old)) refcount_warn_saturate(r, REFCOUNT_ADD_UAF); else if (unlikely(old < 0 || old + i < 0)) refcount_warn_saturate(r, REFCOUNT_ADD_OVF); } /** * refcount_add - add a value to a refcount * @i: the value to add to the refcount * @r: the refcount * * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See the comment on top. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_inc(), or one of its variants, should instead be used to * increment a reference count. */ static inline void refcount_add(int i, refcount_t *r) { __refcount_add(i, r, NULL); } static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp) { return __refcount_add_not_zero(1, r, oldp); } /** * refcount_inc_not_zero - increment a refcount unless it is 0 * @r: the refcount to increment * * Similar to atomic_inc_not_zero(), but will saturate at REFCOUNT_SATURATED * and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See the comment on top. * * Return: true if the increment was successful, false otherwise */ static inline __must_check bool refcount_inc_not_zero(refcount_t *r) { return __refcount_inc_not_zero(r, NULL); } static inline void __refcount_inc(refcount_t *r, int *oldp) { __refcount_add(1, r, oldp); } /** * refcount_inc - increment a refcount * @r: the refcount to increment * * Similar to atomic_inc(), but will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller already has a * reference on the object. * * Will WARN if the refcount is 0, as this represents a possible use-after-free * condition. */ static inline void refcount_inc(refcount_t *r) { __refcount_inc(r, NULL); } static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp) { int old = atomic_fetch_sub_release(i, &r->refs); if (oldp) *oldp = old; if (old == i) { smp_acquire__after_ctrl_dep(); return true; } if (unlikely(old < 0 || old - i < 0)) refcount_warn_saturate(r, REFCOUNT_SUB_UAF); return false; } /** * refcount_sub_and_test - subtract from a refcount and test if it is 0 * @i: amount to subtract from the refcount * @r: the refcount * * Similar to atomic_dec_and_test(), but it will WARN, return false and * ultimately leak on underflow and will fail to decrement when saturated * at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_dec(), or one of its variants, should instead be used to * decrement a reference count. * * Return: true if the resulting refcount is 0, false otherwise */ static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) { return __refcount_sub_and_test(i, r, NULL); } static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp) { return __refcount_sub_and_test(1, r, oldp); } /** * refcount_dec_and_test - decrement a refcount and test if it is 0 * @r: the refcount * * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to * decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Return: true if the resulting refcount is 0, false otherwise */ static inline __must_check bool refcount_dec_and_test(refcount_t *r) { return __refcount_dec_and_test(r, NULL); } static inline void __refcount_dec(refcount_t *r, int *oldp) { int old = atomic_fetch_sub_release(1, &r->refs); if (oldp) *oldp = old; if (unlikely(old <= 1)) refcount_warn_saturate(r, REFCOUNT_DEC_LEAK); } /** * refcount_dec - decrement a refcount * @r: the refcount * * Similar to atomic_dec(), it will WARN on underflow and fail to decrement * when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before. */ static inline void refcount_dec(refcount_t *r) { __refcount_dec(r, NULL); } extern __must_check bool refcount_dec_if_one(refcount_t *r); extern __must_check bool refcount_dec_not_one(refcount_t *r); extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock); extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock); extern __must_check bool refcount_dec_and_lock_irqsave(refcount_t *r, spinlock_t *lock, unsigned long *flags); #endif /* _LINUX_REFCOUNT_H */
9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CURRENT_H #define _ASM_X86_CURRENT_H #include <linux/compiler.h> #include <asm/percpu.h> #ifndef __ASSEMBLY__ struct task_struct; DECLARE_PER_CPU(struct task_struct *, current_task); static __always_inline struct task_struct *get_current(void) { return this_cpu_read_stable(current_task); } #define current get_current() #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_CURRENT_H */
8 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 // SPDX-License-Identifier: GPL-2.0-or-later /* * Kernel Probes (KProbes) * kernel/kprobes.c * * Copyright (C) IBM Corporation, 2002, 2004 * * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel * Probes initial implementation (includes suggestions from * Rusty Russell). * 2004-Aug Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with * hlists and exceptions notifier as suggested by Andi Kleen. * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes * interface to access function arguments. * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes * exceptions notifier to be first on the priority list. * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi * <prasanna@in.ibm.com> added function-return probes. */ #include <linux/kprobes.h> #include <linux/hash.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/stddef.h> #include <linux/export.h> #include <linux/moduleloader.h> #include <linux/kallsyms.h> #include <linux/freezer.h> #include <linux/seq_file.h> #include <linux/debugfs.h> #include <linux/sysctl.h> #include <linux/kdebug.h> #include <linux/memory.h> #include <linux/ftrace.h> #include <linux/cpu.h> #include <linux/jump_label.h> #include <linux/perf_event.h> #include <linux/static_call.h> #include <asm/sections.h> #include <asm/cacheflush.h> #include <asm/errno.h> #include <linux/uaccess.h> #define KPROBE_HASH_BITS 6 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) static int kprobes_initialized; /* kprobe_table can be accessed by * - Normal hlist traversal and RCU add/del under kprobe_mutex is held. * Or * - RCU hlist traversal under disabling preempt (breakpoint handlers) */ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; /* NOTE: change this value only with kprobe_mutex held */ static bool kprobes_all_disarmed; /* This protects kprobe_table and optimizing_list */ static DEFINE_MUTEX(kprobe_mutex); static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; static struct { raw_spinlock_t lock ____cacheline_aligned_in_smp; } kretprobe_table_locks[KPROBE_TABLE_SIZE]; kprobe_opcode_t * __weak kprobe_lookup_name(const char *name, unsigned int __unused) { return ((kprobe_opcode_t *)(kallsyms_lookup_name(name))); } static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) { return &(kretprobe_table_locks[hash].lock); } /* Blacklist -- list of struct kprobe_blacklist_entry */ static LIST_HEAD(kprobe_blacklist); #ifdef __ARCH_WANT_KPROBES_INSN_SLOT /* * kprobe->ainsn.insn points to the copy of the instruction to be * single-stepped. x86_64, POWER4 and above have no-exec support and * stepping on the instruction on a vmalloced/kmalloced/data page * is a recipe for disaster */ struct kprobe_insn_page { struct list_head list; kprobe_opcode_t *insns; /* Page of instruction slots */ struct kprobe_insn_cache *cache; int nused; int ngarbage; char slot_used[]; }; #define KPROBE_INSN_PAGE_SIZE(slots) \ (offsetof(struct kprobe_insn_page, slot_used) + \ (sizeof(char) * (slots))) static int slots_per_page(struct kprobe_insn_cache *c) { return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); } enum kprobe_slot_state { SLOT_CLEAN = 0, SLOT_DIRTY = 1, SLOT_USED = 2, }; void __weak *alloc_insn_page(void) { return module_alloc(PAGE_SIZE); } void __weak free_insn_page(void *page) { module_memfree(page); } struct kprobe_insn_cache kprobe_insn_slots = { .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex), .alloc = alloc_insn_page, .free = free_insn_page, .sym = KPROBE_INSN_PAGE_SYM, .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), .insn_size = MAX_INSN_SIZE, .nr_garbage = 0, }; static int collect_garbage_slots(struct kprobe_insn_cache *c); /** * __get_insn_slot() - Find a slot on an executable page for an instruction. * We allocate an executable page if there's no room on existing ones. */ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) { struct kprobe_insn_page *kip; kprobe_opcode_t *slot = NULL; /* Since the slot array is not protected by rcu, we need a mutex */ mutex_lock(&c->mutex); retry: rcu_read_lock(); list_for_each_entry_rcu(kip, &c->pages, list) { if (kip->nused < slots_per_page(c)) { int i; for (i = 0; i < slots_per_page(c); i++) { if (kip->slot_used[i] == SLOT_CLEAN) { kip->slot_used[i] = SLOT_USED; kip->nused++; slot = kip->insns + (i * c->insn_size); rcu_read_unlock(); goto out; } } /* kip->nused is broken. Fix it. */ kip->nused = slots_per_page(c); WARN_ON(1); } } rcu_read_unlock(); /* If there are any garbage slots, collect it and try again. */ if (c->nr_garbage && collect_garbage_slots(c) == 0) goto retry; /* All out of space. Need to allocate a new page. */ kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); if (!kip) goto out; /* * Use module_alloc so this page is within +/- 2GB of where the * kernel image and loaded module images reside. This is required * so x86_64 can correctly handle the %rip-relative fixups. */ kip->insns = c->alloc(); if (!kip->insns) { kfree(kip); goto out; } INIT_LIST_HEAD(&kip->list); memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); kip->slot_used[0] = SLOT_USED; kip->nused = 1; kip->ngarbage = 0; kip->cache = c; list_add_rcu(&kip->list, &c->pages); slot = kip->insns; /* Record the perf ksymbol register event after adding the page */ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns, PAGE_SIZE, false, c->sym); out: mutex_unlock(&c->mutex); return slot; } /* Return 1 if all garbages are collected, otherwise 0. */ static int collect_one_slot(struct kprobe_insn_page *kip, int idx) { kip->slot_used[idx] = SLOT_CLEAN; kip->nused--; if (kip->nused == 0) { /* * Page is no longer in use. Free it unless * it's the last one. We keep the last one * so as not to have to set it up again the * next time somebody inserts a probe. */ if (!list_is_singular(&kip->list)) { /* * Record perf ksymbol unregister event before removing * the page. */ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns, PAGE_SIZE, true, kip->cache->sym); list_del_rcu(&kip->list); synchronize_rcu(); kip->cache->free(kip->insns); kfree(kip); } return 1; } return 0; } static int collect_garbage_slots(struct kprobe_insn_cache *c) { struct kprobe_insn_page *kip, *next; /* Ensure no-one is interrupted on the garbages */ synchronize_rcu(); list_for_each_entry_safe(kip, next, &c->pages, list) { int i; if (kip->ngarbage == 0) continue; kip->ngarbage = 0; /* we will collect all garbages */ for (i = 0; i < slots_per_page(c); i++) { if (kip->slot_used[i] == SLOT_DIRTY && collect_one_slot(kip, i)) break; } } c->nr_garbage = 0; return 0; } void __free_insn_slot(struct kprobe_insn_cache *c, kprobe_opcode_t *slot, int dirty) { struct kprobe_insn_page *kip; long idx; mutex_lock(&c->mutex); rcu_read_lock(); list_for_each_entry_rcu(kip, &c->pages, list) { idx = ((long)slot - (long)kip->insns) / (c->insn_size * sizeof(kprobe_opcode_t)); if (idx >= 0 && idx < slots_per_page(c)) goto out; } /* Could not find this slot. */ WARN_ON(1); kip = NULL; out: rcu_read_unlock(); /* Mark and sweep: this may sleep */ if (kip) { /* Check double free */ WARN_ON(kip->slot_used[idx] != SLOT_USED); if (dirty) { kip->slot_used[idx] = SLOT_DIRTY; kip->ngarbage++; if (++c->nr_garbage > slots_per_page(c)) collect_garbage_slots(c); } else { collect_one_slot(kip, idx); } } mutex_unlock(&c->mutex); } /* * Check given address is on the page of kprobe instruction slots. * This will be used for checking whether the address on a stack * is on a text area or not. */ bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr) { struct kprobe_insn_page *kip; bool ret = false; rcu_read_lock(); list_for_each_entry_rcu(kip, &c->pages, list) { if (addr >= (unsigned long)kip->insns && addr < (unsigned long)kip->insns + PAGE_SIZE) { ret = true; break; } } rcu_read_unlock(); return ret; } int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum, unsigned long *value, char *type, char *sym) { struct kprobe_insn_page *kip; int ret = -ERANGE; rcu_read_lock(); list_for_each_entry_rcu(kip, &c->pages, list) { if ((*symnum)--) continue; strlcpy(sym, c->sym, KSYM_NAME_LEN); *type = 't'; *value = (unsigned long)kip->insns; ret = 0; break; } rcu_read_unlock(); return ret; } #ifdef CONFIG_OPTPROBES /* For optimized_kprobe buffer */ struct kprobe_insn_cache kprobe_optinsn_slots = { .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex), .alloc = alloc_insn_page, .free = free_insn_page, .sym = KPROBE_OPTINSN_PAGE_SYM, .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), /* .insn_size is initialized later */ .nr_garbage = 0, }; #endif #endif /* We have preemption disabled.. so it is safe to use __ versions */ static inline void set_kprobe_instance(struct kprobe *kp) { __this_cpu_write(kprobe_instance, kp); } static inline void reset_kprobe_instance(void) { __this_cpu_write(kprobe_instance, NULL); } /* * This routine is called either: * - under the kprobe_mutex - during kprobe_[un]register() * OR * - with preemption disabled - from arch/xxx/kernel/kprobes.c */ struct kprobe *get_kprobe(void *addr) { struct hlist_head *head; struct kprobe *p; head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; hlist_for_each_entry_rcu(p, head, hlist, lockdep_is_held(&kprobe_mutex)) { if (p->addr == addr) return p; } return NULL; } NOKPROBE_SYMBOL(get_kprobe); static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); /* Return true if the kprobe is an aggregator */ static inline int kprobe_aggrprobe(struct kprobe *p) { return p->pre_handler == aggr_pre_handler; } /* Return true(!0) if the kprobe is unused */ static inline int kprobe_unused(struct kprobe *p) { return kprobe_aggrprobe(p) && kprobe_disabled(p) && list_empty(&p->list); } /* * Keep all fields in the kprobe consistent */ static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p) { memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t)); memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn)); } #ifdef CONFIG_OPTPROBES /* NOTE: change this value only with kprobe_mutex held */ static bool kprobes_allow_optimization; /* * Call all pre_handler on the list, but ignores its return value. * This must be called from arch-dep optimized caller. */ void opt_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe *kp; list_for_each_entry_rcu(kp, &p->list, list) { if (kp->pre_handler && likely(!kprobe_disabled(kp))) { set_kprobe_instance(kp); kp->pre_handler(kp, regs); } reset_kprobe_instance(); } } NOKPROBE_SYMBOL(opt_pre_handler); /* Free optimized instructions and optimized_kprobe */ static void free_aggr_kprobe(struct kprobe *p) { struct optimized_kprobe *op; op = container_of(p, struct optimized_kprobe, kp); arch_remove_optimized_kprobe(op); arch_remove_kprobe(p); kfree(op); } /* Return true(!0) if the kprobe is ready for optimization. */ static inline int kprobe_optready(struct kprobe *p) { struct optimized_kprobe *op; if (kprobe_aggrprobe(p)) { op = container_of(p, struct optimized_kprobe, kp); return arch_prepared_optinsn(&op->optinsn); } return 0; } /* Return true if the kprobe is disarmed. Note: p must be on hash list */ bool kprobe_disarmed(struct kprobe *p) { struct optimized_kprobe *op; /* If kprobe is not aggr/opt probe, just return kprobe is disabled */ if (!kprobe_aggrprobe(p)) return kprobe_disabled(p); op = container_of(p, struct optimized_kprobe, kp); return kprobe_disabled(p) && list_empty(&op->list); } /* Return true(!0) if the probe is queued on (un)optimizing lists */ static int kprobe_queued(struct kprobe *p) { struct optimized_kprobe *op; if (kprobe_aggrprobe(p)) { op = container_of(p, struct optimized_kprobe, kp); if (!list_empty(&op->list)) return 1; } return 0; } /* * Return an optimized kprobe whose optimizing code replaces * instructions including addr (exclude breakpoint). */ static struct kprobe *get_optimized_kprobe(unsigned long addr) { int i; struct kprobe *p = NULL; struct optimized_kprobe *op; /* Don't check i == 0, since that is a breakpoint case. */ for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++) p = get_kprobe((void *)(addr - i)); if (p && kprobe_optready(p)) { op = container_of(p, struct optimized_kprobe, kp); if (arch_within_optimized_kprobe(op, addr)) return p; } return NULL; } /* Optimization staging list, protected by kprobe_mutex */ static LIST_HEAD(optimizing_list); static LIST_HEAD(unoptimizing_list); static LIST_HEAD(freeing_list); static void kprobe_optimizer(struct work_struct *work); static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); #define OPTIMIZE_DELAY 5 /* * Optimize (replace a breakpoint with a jump) kprobes listed on * optimizing_list. */ static void do_optimize_kprobes(void) { lockdep_assert_held(&text_mutex); /* * The optimization/unoptimization refers online_cpus via * stop_machine() and cpu-hotplug modifies online_cpus. * And same time, text_mutex will be held in cpu-hotplug and here. * This combination can cause a deadlock (cpu-hotplug try to lock * text_mutex but stop_machine can not be done because online_cpus * has been changed) * To avoid this deadlock, caller must have locked cpu hotplug * for preventing cpu-hotplug outside of text_mutex locking. */ lockdep_assert_cpus_held(); /* Optimization never be done when disarmed */ if (kprobes_all_disarmed || !kprobes_allow_optimization || list_empty(&optimizing_list)) return; arch_optimize_kprobes(&optimizing_list); } /* * Unoptimize (replace a jump with a breakpoint and remove the breakpoint * if need) kprobes listed on unoptimizing_list. */ static void do_unoptimize_kprobes(void) { struct optimized_kprobe *op, *tmp; lockdep_assert_held(&text_mutex); /* See comment in do_optimize_kprobes() */ lockdep_assert_cpus_held(); if (!list_empty(&unoptimizing_list)) arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); /* Loop on 'freeing_list' for disarming and removing from kprobe hash list */ list_for_each_entry_safe(op, tmp, &freeing_list, list) { /* Switching from detour code to origin */ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; /* Disarm probes if marked disabled and not gone */ if (kprobe_disabled(&op->kp) && !kprobe_gone(&op->kp)) arch_disarm_kprobe(&op->kp); if (kprobe_unused(&op->kp)) { /* * Remove unused probes from hash list. After waiting * for synchronization, these probes are reclaimed. * (reclaiming is done by do_free_cleaned_kprobes.) */ hlist_del_rcu(&op->kp.hlist); } else list_del_init(&op->list); } } /* Reclaim all kprobes on the free_list */ static void do_free_cleaned_kprobes(void) { struct optimized_kprobe *op, *tmp; list_for_each_entry_safe(op, tmp, &freeing_list, list) { list_del_init(&op->list); if (WARN_ON_ONCE(!kprobe_unused(&op->kp))) { /* * This must not happen, but if there is a kprobe * still in use, keep it on kprobes hash list. */ continue; } free_aggr_kprobe(&op->kp); } } /* Start optimizer after OPTIMIZE_DELAY passed */ static void kick_kprobe_optimizer(void) { schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); } /* Kprobe jump optimizer */ static void kprobe_optimizer(struct work_struct *work) { mutex_lock(&kprobe_mutex); cpus_read_lock(); mutex_lock(&text_mutex); /* * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) * kprobes before waiting for quiesence period. */ do_unoptimize_kprobes(); /* * Step 2: Wait for quiesence period to ensure all potentially * preempted tasks to have normally scheduled. Because optprobe * may modify multiple instructions, there is a chance that Nth * instruction is preempted. In that case, such tasks can return * to 2nd-Nth byte of jump instruction. This wait is for avoiding it. * Note that on non-preemptive kernel, this is transparently converted * to synchronoze_sched() to wait for all interrupts to have completed. */ synchronize_rcu_tasks(); /* Step 3: Optimize kprobes after quiesence period */ do_optimize_kprobes(); /* Step 4: Free cleaned kprobes after quiesence period */ do_free_cleaned_kprobes(); mutex_unlock(&text_mutex); cpus_read_unlock(); /* Step 5: Kick optimizer again if needed */ if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) kick_kprobe_optimizer(); mutex_unlock(&kprobe_mutex); } /* Wait for completing optimization and unoptimization */ void wait_for_kprobe_optimizer(void) { mutex_lock(&kprobe_mutex); while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) { mutex_unlock(&kprobe_mutex); /* this will also make optimizing_work execute immmediately */ flush_delayed_work(&optimizing_work); /* @optimizing_work might not have been queued yet, relax */ cpu_relax(); mutex_lock(&kprobe_mutex); } mutex_unlock(&kprobe_mutex); } bool optprobe_queued_unopt(struct optimized_kprobe *op) { struct optimized_kprobe *_op; list_for_each_entry(_op, &unoptimizing_list, list) { if (op == _op) return true; } return false; } /* Optimize kprobe if p is ready to be optimized */ static void optimize_kprobe(struct kprobe *p) { struct optimized_kprobe *op; /* Check if the kprobe is disabled or not ready for optimization. */ if (!kprobe_optready(p) || !kprobes_allow_optimization || (kprobe_disabled(p) || kprobes_all_disarmed)) return; /* kprobes with post_handler can not be optimized */ if (p->post_handler) return; op = container_of(p, struct optimized_kprobe, kp); /* Check there is no other kprobes at the optimized instructions */ if (arch_check_optimized_kprobe(op) < 0) return; /* Check if it is already optimized. */ if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) { if (optprobe_queued_unopt(op)) { /* This is under unoptimizing. Just dequeue the probe */ list_del_init(&op->list); } return; } op->kp.flags |= KPROBE_FLAG_OPTIMIZED; /* On unoptimizing/optimizing_list, op must have OPTIMIZED flag */ if (WARN_ON_ONCE(!list_empty(&op->list))) return; list_add(&op->list, &optimizing_list); kick_kprobe_optimizer(); } /* Short cut to direct unoptimizing */ static void force_unoptimize_kprobe(struct optimized_kprobe *op) { lockdep_assert_cpus_held(); arch_unoptimize_kprobe(op); op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; } /* Unoptimize a kprobe if p is optimized */ static void unoptimize_kprobe(struct kprobe *p, bool force) { struct optimized_kprobe *op; if (!kprobe_aggrprobe(p) || kprobe_disarmed(p)) return; /* This is not an optprobe nor optimized */ op = container_of(p, struct optimized_kprobe, kp); if (!kprobe_optimized(p)) return; if (!list_empty(&op->list)) { if (optprobe_queued_unopt(op)) { /* Queued in unoptimizing queue */ if (force) { /* * Forcibly unoptimize the kprobe here, and queue it * in the freeing list for release afterwards. */ force_unoptimize_kprobe(op); list_move(&op->list, &freeing_list); } } else { /* Dequeue from the optimizing queue */ list_del_init(&op->list); op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; } return; } /* Optimized kprobe case */ if (force) { /* Forcibly update the code: this is a special case */ force_unoptimize_kprobe(op); } else { list_add(&op->list, &unoptimizing_list); kick_kprobe_optimizer(); } } /* Cancel unoptimizing for reusing */ static int reuse_unused_kprobe(struct kprobe *ap) { struct optimized_kprobe *op; /* * Unused kprobe MUST be on the way of delayed unoptimizing (means * there is still a relative jump) and disabled. */ op = container_of(ap, struct optimized_kprobe, kp); WARN_ON_ONCE(list_empty(&op->list)); /* Enable the probe again */ ap->flags &= ~KPROBE_FLAG_DISABLED; /* Optimize it again (remove from op->list) */ if (!kprobe_optready(ap)) return -EINVAL; optimize_kprobe(ap); return 0; } /* Remove optimized instructions */ static void kill_optimized_kprobe(struct kprobe *p) { struct optimized_kprobe *op; op = container_of(p, struct optimized_kprobe, kp); if (!list_empty(&op->list)) /* Dequeue from the (un)optimization queue */ list_del_init(&op->list); op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; if (kprobe_unused(p)) { /* * Unused kprobe is on unoptimizing or freeing list. We move it * to freeing_list and let the kprobe_optimizer() remove it from * the kprobe hash list and free it. */ if (optprobe_queued_unopt(op)) list_move(&op->list, &freeing_list); } /* Don't touch the code, because it is already freed. */ arch_remove_optimized_kprobe(op); } static inline void __prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) { if (!kprobe_ftrace(p)) arch_prepare_optimized_kprobe(op, p); } /* Try to prepare optimized instructions */ static void prepare_optimized_kprobe(struct kprobe *p) { struct optimized_kprobe *op; op = container_of(p, struct optimized_kprobe, kp); __prepare_optimized_kprobe(op, p); } /* Allocate new optimized_kprobe and try to prepare optimized instructions */ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) { struct optimized_kprobe *op; op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL); if (!op) return NULL; INIT_LIST_HEAD(&op->list); op->kp.addr = p->addr; __prepare_optimized_kprobe(op, p); return &op->kp; } static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); /* * Prepare an optimized_kprobe and optimize it * NOTE: p must be a normal registered kprobe */ static void try_to_optimize_kprobe(struct kprobe *p) { struct kprobe *ap; struct optimized_kprobe *op; /* Impossible to optimize ftrace-based kprobe */ if (kprobe_ftrace(p)) return; /* For preparing optimization, jump_label_text_reserved() is called */ cpus_read_lock(); jump_label_lock(); mutex_lock(&text_mutex); ap = alloc_aggr_kprobe(p); if (!ap) goto out; op = container_of(ap, struct optimized_kprobe, kp); if (!arch_prepared_optinsn(&op->optinsn)) { /* If failed to setup optimizing, fallback to kprobe */ arch_remove_optimized_kprobe(op); kfree(op); goto out; } init_aggr_kprobe(ap, p); optimize_kprobe(ap); /* This just kicks optimizer thread */ out: mutex_unlock(&text_mutex); jump_label_unlock(); cpus_read_unlock(); } static void optimize_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; unsigned int i; mutex_lock(&kprobe_mutex); /* If optimization is already allowed, just return */ if (kprobes_allow_optimization) goto out; cpus_read_lock(); kprobes_allow_optimization = true; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry(p, head, hlist) if (!kprobe_disabled(p)) optimize_kprobe(p); } cpus_read_unlock(); printk(KERN_INFO "Kprobes globally optimized\n"); out: mutex_unlock(&kprobe_mutex); } #ifdef CONFIG_SYSCTL static void unoptimize_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; unsigned int i; mutex_lock(&kprobe_mutex); /* If optimization is already prohibited, just return */ if (!kprobes_allow_optimization) { mutex_unlock(&kprobe_mutex); return; } cpus_read_lock(); kprobes_allow_optimization = false; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry(p, head, hlist) { if (!kprobe_disabled(p)) unoptimize_kprobe(p, false); } } cpus_read_unlock(); mutex_unlock(&kprobe_mutex); /* Wait for unoptimizing completion */ wait_for_kprobe_optimizer(); printk(KERN_INFO "Kprobes globally unoptimized\n"); } static DEFINE_MUTEX(kprobe_sysctl_mutex); int sysctl_kprobes_optimization; int proc_kprobes_optimization_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int ret; mutex_lock(&kprobe_sysctl_mutex); sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0; ret = proc_dointvec_minmax(table, write, buffer, length, ppos); if (sysctl_kprobes_optimization) optimize_all_kprobes(); else unoptimize_all_kprobes(); mutex_unlock(&kprobe_sysctl_mutex); return ret; } #endif /* CONFIG_SYSCTL */ /* Put a breakpoint for a probe. Must be called with text_mutex locked */ static void __arm_kprobe(struct kprobe *p) { struct kprobe *_p; /* Check collision with other optimized kprobes */ _p = get_optimized_kprobe((unsigned long)p->addr); if (unlikely(_p)) /* Fallback to unoptimized kprobe */ unoptimize_kprobe(_p, true); arch_arm_kprobe(p); optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ } /* Remove the breakpoint of a probe. Must be called with text_mutex locked */ static void __disarm_kprobe(struct kprobe *p, bool reopt) { struct kprobe *_p; /* Try to unoptimize */ unoptimize_kprobe(p, kprobes_all_disarmed); if (!kprobe_queued(p)) { arch_disarm_kprobe(p); /* If another kprobe was blocked, optimize it. */ _p = get_optimized_kprobe((unsigned long)p->addr); if (unlikely(_p) && reopt) optimize_kprobe(_p); } /* TODO: reoptimize others after unoptimized this probe */ } #else /* !CONFIG_OPTPROBES */ #define optimize_kprobe(p) do {} while (0) #define unoptimize_kprobe(p, f) do {} while (0) #define kill_optimized_kprobe(p) do {} while (0) #define prepare_optimized_kprobe(p) do {} while (0) #define try_to_optimize_kprobe(p) do {} while (0) #define __arm_kprobe(p) arch_arm_kprobe(p) #define __disarm_kprobe(p, o) arch_disarm_kprobe(p) #define kprobe_disarmed(p) kprobe_disabled(p) #define wait_for_kprobe_optimizer() do {} while (0) static int reuse_unused_kprobe(struct kprobe *ap) { /* * If the optimized kprobe is NOT supported, the aggr kprobe is * released at the same time that the last aggregated kprobe is * unregistered. * Thus there should be no chance to reuse unused kprobe. */ printk(KERN_ERR "Error: There should be no unused kprobe here.\n"); return -EINVAL; } static void free_aggr_kprobe(struct kprobe *p) { arch_remove_kprobe(p); kfree(p); } static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) { return kzalloc(sizeof(struct kprobe), GFP_KERNEL); } #endif /* CONFIG_OPTPROBES */ #ifdef CONFIG_KPROBES_ON_FTRACE static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { .func = kprobe_ftrace_handler, .flags = FTRACE_OPS_FL_SAVE_REGS, }; static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = { .func = kprobe_ftrace_handler, .flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY, }; static int kprobe_ipmodify_enabled; static int kprobe_ftrace_enabled; /* Must ensure p->addr is really on ftrace */ static int prepare_kprobe(struct kprobe *p) { if (!kprobe_ftrace(p)) return arch_prepare_kprobe(p); return arch_prepare_kprobe_ftrace(p); } /* Caller must lock kprobe_mutex */ static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, int *cnt) { int ret = 0; ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0); if (ret) { pr_debug("Failed to arm kprobe-ftrace at %pS (%d)\n", p->addr, ret); return ret; } if (*cnt == 0) { ret = register_ftrace_function(ops); if (ret) { pr_debug("Failed to init kprobe-ftrace (%d)\n", ret); goto err_ftrace; } } (*cnt)++; return ret; err_ftrace: /* * At this point, sinec ops is not registered, we should be sefe from * registering empty filter. */ ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0); return ret; } static int arm_kprobe_ftrace(struct kprobe *p) { bool ipmodify = (p->post_handler != NULL); return __arm_kprobe_ftrace(p, ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops, ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); } /* Caller must lock kprobe_mutex */ static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, int *cnt) { int ret = 0; if (*cnt == 1) { ret = unregister_ftrace_function(ops); if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret)) return ret; } (*cnt)--; ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0); WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (%d)\n", p->addr, ret); return ret; } static int disarm_kprobe_ftrace(struct kprobe *p) { bool ipmodify = (p->post_handler != NULL); return __disarm_kprobe_ftrace(p, ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops, ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); } #else /* !CONFIG_KPROBES_ON_FTRACE */ static inline int prepare_kprobe(struct kprobe *p) { return arch_prepare_kprobe(p); } static inline int arm_kprobe_ftrace(struct kprobe *p) { return -ENODEV; } static inline int disarm_kprobe_ftrace(struct kprobe *p) { return -ENODEV; } #endif /* Arm a kprobe with text_mutex */ static int arm_kprobe(struct kprobe *kp) { if (unlikely(kprobe_ftrace(kp))) return arm_kprobe_ftrace(kp); cpus_read_lock(); mutex_lock(&text_mutex); __arm_kprobe(kp); mutex_unlock(&text_mutex); cpus_read_unlock(); return 0; } /* Disarm a kprobe with text_mutex */ static int disarm_kprobe(struct kprobe *kp, bool reopt) { if (unlikely(kprobe_ftrace(kp))) return disarm_kprobe_ftrace(kp); cpus_read_lock(); mutex_lock(&text_mutex); __disarm_kprobe(kp, reopt); mutex_unlock(&text_mutex); cpus_read_unlock(); return 0; } /* * Aggregate handlers for multiple kprobes support - these handlers * take care of invoking the individual kprobe handlers on p->list */ static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe *kp; list_for_each_entry_rcu(kp, &p->list, list) { if (kp->pre_handler && likely(!kprobe_disabled(kp))) { set_kprobe_instance(kp); if (kp->pre_handler(kp, regs)) return 1; } reset_kprobe_instance(); } return 0; } NOKPROBE_SYMBOL(aggr_pre_handler); static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, unsigned long flags) { struct kprobe *kp; list_for_each_entry_rcu(kp, &p->list, list) { if (kp->post_handler && likely(!kprobe_disabled(kp))) { set_kprobe_instance(kp); kp->post_handler(kp, regs, flags); reset_kprobe_instance(); } } } NOKPROBE_SYMBOL(aggr_post_handler); static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr) { struct kprobe *cur = __this_cpu_read(kprobe_instance); /* * if we faulted "during" the execution of a user specified * probe handler, invoke just that probe's fault handler */ if (cur && cur->fault_handler) { if (cur->fault_handler(cur, regs, trapnr)) return 1; } return 0; } NOKPROBE_SYMBOL(aggr_fault_handler); /* Walks the list and increments nmissed count for multiprobe case */ void kprobes_inc_nmissed_count(struct kprobe *p) { struct kprobe *kp; if (!kprobe_aggrprobe(p)) { p->nmissed++; } else { list_for_each_entry_rcu(kp, &p->list, list) kp->nmissed++; } return; } NOKPROBE_SYMBOL(kprobes_inc_nmissed_count); static void recycle_rp_inst(struct kretprobe_instance *ri) { struct kretprobe *rp = ri->rp; /* remove rp inst off the rprobe_inst_table */ hlist_del(&ri->hlist); INIT_HLIST_NODE(&ri->hlist); if (likely(rp)) { raw_spin_lock(&rp->lock); hlist_add_head(&ri->hlist, &rp->free_instances); raw_spin_unlock(&rp->lock); } else kfree_rcu(ri, rcu); } NOKPROBE_SYMBOL(recycle_rp_inst); static void kretprobe_hash_lock(struct task_struct *tsk, struct hlist_head **head, unsigned long *flags) __acquires(hlist_lock) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); raw_spinlock_t *hlist_lock; *head = &kretprobe_inst_table[hash]; hlist_lock = kretprobe_table_lock_ptr(hash); /* * Nested is a workaround that will soon not be needed. * There's other protections that make sure the same lock * is not taken on the same CPU that lockdep is unaware of. * Differentiate when it is taken in NMI context. */ raw_spin_lock_irqsave_nested(hlist_lock, *flags, !!in_nmi()); } NOKPROBE_SYMBOL(kretprobe_hash_lock); static void kretprobe_table_lock(unsigned long hash, unsigned long *flags) __acquires(hlist_lock) { raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); /* * Nested is a workaround that will soon not be needed. * There's other protections that make sure the same lock * is not taken on the same CPU that lockdep is unaware of. * Differentiate when it is taken in NMI context. */ raw_spin_lock_irqsave_nested(hlist_lock, *flags, !!in_nmi()); } NOKPROBE_SYMBOL(kretprobe_table_lock); static void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) __releases(hlist_lock) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); raw_spinlock_t *hlist_lock; hlist_lock = kretprobe_table_lock_ptr(hash); raw_spin_unlock_irqrestore(hlist_lock, *flags); } NOKPROBE_SYMBOL(kretprobe_hash_unlock); static void kretprobe_table_unlock(unsigned long hash, unsigned long *flags) __releases(hlist_lock) { raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); raw_spin_unlock_irqrestore(hlist_lock, *flags); } NOKPROBE_SYMBOL(kretprobe_table_unlock); static struct kprobe kprobe_busy = { .addr = (void *) get_kprobe, }; void kprobe_busy_begin(void) { struct kprobe_ctlblk *kcb; preempt_disable(); __this_cpu_write(current_kprobe, &kprobe_busy); kcb = get_kprobe_ctlblk(); kcb->kprobe_status = KPROBE_HIT_ACTIVE; } void kprobe_busy_end(void) { __this_cpu_write(current_kprobe, NULL); preempt_enable(); } /* * This function is called from finish_task_switch when task tk becomes dead, * so that we can recycle any function-return probe instances associated * with this task. These left over instances represent probed functions * that have been called but will never return. */ void kprobe_flush_task(struct task_struct *tk) { struct kretprobe_instance *ri; struct hlist_head *head; struct hlist_node *tmp; unsigned long hash, flags = 0; if (unlikely(!kprobes_initialized)) /* Early boot. kretprobe_table_locks not yet initialized. */ return; kprobe_busy_begin(); hash = hash_ptr(tk, KPROBE_HASH_BITS); head = &kretprobe_inst_table[hash]; kretprobe_table_lock(hash, &flags); hlist_for_each_entry_safe(ri, tmp, head, hlist) { if (ri->task == tk) recycle_rp_inst(ri); } kretprobe_table_unlock(hash, &flags); kprobe_busy_end(); } NOKPROBE_SYMBOL(kprobe_flush_task); static inline void free_rp_inst(struct kretprobe *rp) { struct kretprobe_instance *ri; struct hlist_node *next; hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) { hlist_del(&ri->hlist); kfree(ri); } } static void cleanup_rp_inst(struct kretprobe *rp) { unsigned long flags, hash; struct kretprobe_instance *ri; struct hlist_node *next; struct hlist_head *head; /* To avoid recursive kretprobe by NMI, set kprobe busy here */ kprobe_busy_begin(); for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) { kretprobe_table_lock(hash, &flags); head = &kretprobe_inst_table[hash]; hlist_for_each_entry_safe(ri, next, head, hlist) { if (ri->rp == rp) ri->rp = NULL; } kretprobe_table_unlock(hash, &flags); } kprobe_busy_end(); free_rp_inst(rp); } NOKPROBE_SYMBOL(cleanup_rp_inst); /* Add the new probe to ap->list */ static int add_new_kprobe(struct kprobe *ap, struct kprobe *p) { if (p->post_handler) unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */ list_add_rcu(&p->list, &ap->list); if (p->post_handler && !ap->post_handler) ap->post_handler = aggr_post_handler; return 0; } /* * Fill in the required fields of the "manager kprobe". Replace the * earlier kprobe in the hlist with the manager kprobe */ static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) { /* Copy p's insn slot to ap */ copy_kprobe(p, ap); flush_insn_slot(ap); ap->addr = p->addr; ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED; ap->pre_handler = aggr_pre_handler; ap->fault_handler = aggr_fault_handler; /* We don't care the kprobe which has gone. */ if (p->post_handler && !kprobe_gone(p)) ap->post_handler = aggr_post_handler; INIT_LIST_HEAD(&ap->list); INIT_HLIST_NODE(&ap->hlist); list_add_rcu(&p->list, &ap->list); hlist_replace_rcu(&p->hlist, &ap->hlist); } /* * This is the second or subsequent kprobe at the address - handle * the intricacies */ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) { int ret = 0; struct kprobe *ap = orig_p; cpus_read_lock(); /* For preparing optimization, jump_label_text_reserved() is called */ jump_label_lock(); mutex_lock(&text_mutex); if (!kprobe_aggrprobe(orig_p)) { /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */ ap = alloc_aggr_kprobe(orig_p); if (!ap) { ret = -ENOMEM; goto out; } init_aggr_kprobe(ap, orig_p); } else if (kprobe_unused(ap)) { /* This probe is going to die. Rescue it */ ret = reuse_unused_kprobe(ap); if (ret) goto out; } if (kprobe_gone(ap)) { /* * Attempting to insert new probe at the same location that * had a probe in the module vaddr area which already * freed. So, the instruction slot has already been * released. We need a new slot for the new probe. */ ret = arch_prepare_kprobe(ap); if (ret) /* * Even if fail to allocate new slot, don't need to * free aggr_probe. It will be used next time, or * freed by unregister_kprobe. */ goto out; /* Prepare optimized instructions if possible. */ prepare_optimized_kprobe(ap); /* * Clear gone flag to prevent allocating new slot again, and * set disabled flag because it is not armed yet. */ ap->flags = (ap->flags & ~KPROBE_FLAG_GONE) | KPROBE_FLAG_DISABLED; } /* Copy ap's insn slot to p */ copy_kprobe(ap, p); ret = add_new_kprobe(ap, p); out: mutex_unlock(&text_mutex); jump_label_unlock(); cpus_read_unlock(); if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) { ap->flags &= ~KPROBE_FLAG_DISABLED; if (!kprobes_all_disarmed) { /* Arm the breakpoint again. */ ret = arm_kprobe(ap); if (ret) { ap->flags |= KPROBE_FLAG_DISABLED; list_del_rcu(&p->list); synchronize_rcu(); } } } return ret; } bool __weak arch_within_kprobe_blacklist(unsigned long addr) { /* The __kprobes marked functions and entry code must not be probed */ return addr >= (unsigned long)__kprobes_text_start && addr < (unsigned long)__kprobes_text_end; } static bool __within_kprobe_blacklist(unsigned long addr) { struct kprobe_blacklist_entry *ent; if (arch_within_kprobe_blacklist(addr)) return true; /* * If there exists a kprobe_blacklist, verify and * fail any probe registration in the prohibited area */ list_for_each_entry(ent, &kprobe_blacklist, list) { if (addr >= ent->start_addr && addr < ent->end_addr) return true; } return false; } bool within_kprobe_blacklist(unsigned long addr) { char symname[KSYM_NAME_LEN], *p; if (__within_kprobe_blacklist(addr)) return true; /* Check if the address is on a suffixed-symbol */ if (!lookup_symbol_name(addr, symname)) { p = strchr(symname, '.'); if (!p) return false; *p = '\0'; addr = (unsigned long)kprobe_lookup_name(symname, 0); if (addr) return __within_kprobe_blacklist(addr); } return false; } /* * If we have a symbol_name argument, look it up and add the offset field * to it. This way, we can specify a relative address to a symbol. * This returns encoded errors if it fails to look up symbol or invalid * combination of parameters. */ static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr, const char *symbol_name, unsigned int offset) { if ((symbol_name && addr) || (!symbol_name && !addr)) goto invalid; if (symbol_name) { addr = kprobe_lookup_name(symbol_name, offset); if (!addr) return ERR_PTR(-ENOENT); } addr = (kprobe_opcode_t *)(((char *)addr) + offset); if (addr) return addr; invalid: return ERR_PTR(-EINVAL); } static kprobe_opcode_t *kprobe_addr(struct kprobe *p) { return _kprobe_addr(p->addr, p->symbol_name, p->offset); } /* Check passed kprobe is valid and return kprobe in kprobe_table. */ static struct kprobe *__get_valid_kprobe(struct kprobe *p) { struct kprobe *ap, *list_p; lockdep_assert_held(&kprobe_mutex); ap = get_kprobe(p->addr); if (unlikely(!ap)) return NULL; if (p != ap) { list_for_each_entry(list_p, &ap->list, list) if (list_p == p) /* kprobe p is a valid probe */ goto valid; return NULL; } valid: return ap; } /* Return error if the kprobe is being re-registered */ static inline int check_kprobe_rereg(struct kprobe *p) { int ret = 0; mutex_lock(&kprobe_mutex); if (__get_valid_kprobe(p)) ret = -EINVAL; mutex_unlock(&kprobe_mutex); return ret; } int __weak arch_check_ftrace_location(struct kprobe *p) { unsigned long ftrace_addr; ftrace_addr = ftrace_location((unsigned long)p->addr); if (ftrace_addr) { #ifdef CONFIG_KPROBES_ON_FTRACE /* Given address is not on the instruction boundary */ if ((unsigned long)p->addr != ftrace_addr) return -EILSEQ; p->flags |= KPROBE_FLAG_FTRACE; #else /* !CONFIG_KPROBES_ON_FTRACE */ return -EINVAL; #endif } return 0; } static bool is_cfi_preamble_symbol(unsigned long addr) { char symbuf[KSYM_NAME_LEN]; if (lookup_symbol_name(addr, symbuf)) return false; return str_has_prefix("__cfi_", symbuf) || str_has_prefix("__pfx_", symbuf); } static int check_kprobe_address_safe(struct kprobe *p, struct module **probed_mod) { int ret; ret = arch_check_ftrace_location(p); if (ret) return ret; jump_label_lock(); preempt_disable(); /* Ensure it is not in reserved area nor out of text */ if (!(core_kernel_text((unsigned long) p->addr) || is_module_text_address((unsigned long) p->addr)) || in_gate_area_no_mm((unsigned long) p->addr) || within_kprobe_blacklist((unsigned long) p->addr) || jump_label_text_reserved(p->addr, p->addr) || static_call_text_reserved(p->addr, p->addr) || find_bug((unsigned long)p->addr) || is_cfi_preamble_symbol((unsigned long)p->addr)) { ret = -EINVAL; goto out; } /* Check if are we probing a module */ *probed_mod = __module_text_address((unsigned long) p->addr); if (*probed_mod) { /* * We must hold a refcount of the probed module while updating * its code to prohibit unexpected unloading. */ if (unlikely(!try_module_get(*probed_mod))) { ret = -ENOENT; goto out; } /* * If the module freed .init.text, we couldn't insert * kprobes in there. */ if (within_module_init((unsigned long)p->addr, *probed_mod) && (*probed_mod)->state != MODULE_STATE_COMING) { module_put(*probed_mod); *probed_mod = NULL; ret = -ENOENT; } } out: preempt_enable(); jump_label_unlock(); return ret; } int register_kprobe(struct kprobe *p) { int ret; struct kprobe *old_p; struct module *probed_mod; kprobe_opcode_t *addr; /* Adjust probe address from symbol */ addr = kprobe_addr(p); if (IS_ERR(addr)) return PTR_ERR(addr); p->addr = addr; ret = check_kprobe_rereg(p); if (ret) return ret; /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ p->flags &= KPROBE_FLAG_DISABLED; p->nmissed = 0; INIT_LIST_HEAD(&p->list); ret = check_kprobe_address_safe(p, &probed_mod); if (ret) return ret; mutex_lock(&kprobe_mutex); old_p = get_kprobe(p->addr); if (old_p) { /* Since this may unoptimize old_p, locking text_mutex. */ ret = register_aggr_kprobe(old_p, p); goto out; } cpus_read_lock(); /* Prevent text modification */ mutex_lock(&text_mutex); ret = prepare_kprobe(p); mutex_unlock(&text_mutex); cpus_read_unlock(); if (ret) goto out; INIT_HLIST_NODE(&p->hlist); hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); if (!kprobes_all_disarmed && !kprobe_disabled(p)) { ret = arm_kprobe(p); if (ret) { hlist_del_rcu(&p->hlist); synchronize_rcu(); goto out; } } /* Try to optimize kprobe */ try_to_optimize_kprobe(p); out: mutex_unlock(&kprobe_mutex); if (probed_mod) module_put(probed_mod); return ret; } EXPORT_SYMBOL_GPL(register_kprobe); /* Check if all probes on the aggrprobe are disabled */ static int aggr_kprobe_disabled(struct kprobe *ap) { struct kprobe *kp; lockdep_assert_held(&kprobe_mutex); list_for_each_entry(kp, &ap->list, list) if (!kprobe_disabled(kp)) /* * There is an active probe on the list. * We can't disable this ap. */ return 0; return 1; } /* Disable one kprobe: Make sure called under kprobe_mutex is locked */ static struct kprobe *__disable_kprobe(struct kprobe *p) { struct kprobe *orig_p; int ret; /* Get an original kprobe for return */ orig_p = __get_valid_kprobe(p); if (unlikely(orig_p == NULL)) return ERR_PTR(-EINVAL); if (!kprobe_disabled(p)) { /* Disable probe if it is a child probe */ if (p != orig_p) p->flags |= KPROBE_FLAG_DISABLED; /* Try to disarm and disable this/parent probe */ if (p == orig_p || aggr_kprobe_disabled(orig_p)) { /* * Don't be lazy here. Even if 'kprobes_all_disarmed' * is false, 'orig_p' might not have been armed yet. * Note arm_all_kprobes() __tries__ to arm all kprobes * on the best effort basis. */ if (!kprobes_all_disarmed && !kprobe_disabled(orig_p)) { ret = disarm_kprobe(orig_p, true); if (ret) { p->flags &= ~KPROBE_FLAG_DISABLED; return ERR_PTR(ret); } } orig_p->flags |= KPROBE_FLAG_DISABLED; } } return orig_p; } /* * Unregister a kprobe without a scheduler synchronization. */ static int __unregister_kprobe_top(struct kprobe *p) { struct kprobe *ap, *list_p; /* Disable kprobe. This will disarm it if needed. */ ap = __disable_kprobe(p); if (IS_ERR(ap)) return PTR_ERR(ap); if (ap == p) /* * This probe is an independent(and non-optimized) kprobe * (not an aggrprobe). Remove from the hash list. */ goto disarmed; /* Following process expects this probe is an aggrprobe */ WARN_ON(!kprobe_aggrprobe(ap)); if (list_is_singular(&ap->list) && kprobe_disarmed(ap)) /* * !disarmed could be happen if the probe is under delayed * unoptimizing. */ goto disarmed; else { /* If disabling probe has special handlers, update aggrprobe */ if (p->post_handler && !kprobe_gone(p)) { list_for_each_entry(list_p, &ap->list, list) { if ((list_p != p) && (list_p->post_handler)) goto noclean; } /* * For the kprobe-on-ftrace case, we keep the * post_handler setting to identify this aggrprobe * armed with kprobe_ipmodify_ops. */ if (!kprobe_ftrace(ap)) ap->post_handler = NULL; } noclean: /* * Remove from the aggrprobe: this path will do nothing in * __unregister_kprobe_bottom(). */ list_del_rcu(&p->list); if (!kprobe_disabled(ap) && !kprobes_all_disarmed) /* * Try to optimize this probe again, because post * handler may have been changed. */ optimize_kprobe(ap); } return 0; disarmed: hlist_del_rcu(&ap->hlist); return 0; } static void __unregister_kprobe_bottom(struct kprobe *p) { struct kprobe *ap; if (list_empty(&p->list)) /* This is an independent kprobe */ arch_remove_kprobe(p); else if (list_is_singular(&p->list)) { /* This is the last child of an aggrprobe */ ap = list_entry(p->list.next, struct kprobe, list); list_del(&p->list); free_aggr_kprobe(ap); } /* Otherwise, do nothing. */ } int register_kprobes(struct kprobe **kps, int num) { int i, ret = 0; if (num <= 0) return -EINVAL; for (i = 0; i < num; i++) { ret = register_kprobe(kps[i]); if (ret < 0) { if (i > 0) unregister_kprobes(kps, i); break; } } return ret; } EXPORT_SYMBOL_GPL(register_kprobes); void unregister_kprobe(struct kprobe *p) { unregister_kprobes(&p, 1); } EXPORT_SYMBOL_GPL(unregister_kprobe); void unregister_kprobes(struct kprobe **kps, int num) { int i; if (num <= 0) return; mutex_lock(&kprobe_mutex); for (i = 0; i < num; i++) if (__unregister_kprobe_top(kps[i]) < 0) kps[i]->addr = NULL; mutex_unlock(&kprobe_mutex); synchronize_rcu(); for (i = 0; i < num; i++) if (kps[i]->addr) __unregister_kprobe_bottom(kps[i]); } EXPORT_SYMBOL_GPL(unregister_kprobes); int __weak kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) { return NOTIFY_DONE; } NOKPROBE_SYMBOL(kprobe_exceptions_notify); static struct notifier_block kprobe_exceptions_nb = { .notifier_call = kprobe_exceptions_notify, .priority = 0x7fffffff /* we need to be notified first */ }; unsigned long __weak arch_deref_entry_point(void *entry) { return (unsigned long)entry; } #ifdef CONFIG_KRETPROBES unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, void *trampoline_address, void *frame_pointer) { struct kretprobe_instance *ri = NULL, *last = NULL; struct hlist_head *head; struct hlist_node *tmp; unsigned long flags; kprobe_opcode_t *correct_ret_addr = NULL; bool skipped = false; kretprobe_hash_lock(current, &head, &flags); /* * It is possible to have multiple instances associated with a given * task either because multiple functions in the call path have * return probes installed on them, and/or more than one * return probe was registered for a target function. * * We can handle this because: * - instances are always pushed into the head of the list * - when multiple return probes are registered for the same * function, the (chronologically) first instance's ret_addr * will be the real return address, and all the rest will * point to kretprobe_trampoline. */ hlist_for_each_entry(ri, head, hlist) { if (ri->task != current) /* another task is sharing our hash bucket */ continue; /* * Return probes must be pushed on this hash list correct * order (same as return order) so that it can be popped * correctly. However, if we find it is pushed it incorrect * order, this means we find a function which should not be * probed, because the wrong order entry is pushed on the * path of processing other kretprobe itself. */ if (ri->fp != frame_pointer) { if (!skipped) pr_warn("kretprobe is stacked incorrectly. Trying to fixup.\n"); skipped = true; continue; } correct_ret_addr = ri->ret_addr; if (skipped) pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n", ri->rp->kp.addr); if (correct_ret_addr != trampoline_address) /* * This is the real return address. Any other * instances associated with this task are for * other calls deeper on the call stack */ break; } BUG_ON(!correct_ret_addr || (correct_ret_addr == trampoline_address)); last = ri; hlist_for_each_entry_safe(ri, tmp, head, hlist) { if (ri->task != current) /* another task is sharing our hash bucket */ continue; if (ri->fp != frame_pointer) continue; if (ri->rp && ri->rp->handler) { struct kprobe *prev = kprobe_running(); __this_cpu_write(current_kprobe, &ri->rp->kp); ri->ret_addr = correct_ret_addr; ri->rp->handler(ri, regs); __this_cpu_write(current_kprobe, prev); } recycle_rp_inst(ri); if (ri == last) break; } kretprobe_hash_unlock(current, &flags); return (unsigned long)correct_ret_addr; } NOKPROBE_SYMBOL(__kretprobe_trampoline_handler) /* * This kprobe pre_handler is registered with every kretprobe. When probe * hits it will set up the return probe. */ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) { struct kretprobe *rp = container_of(p, struct kretprobe, kp); unsigned long hash, flags = 0; struct kretprobe_instance *ri; /* TODO: consider to only swap the RA after the last pre_handler fired */ hash = hash_ptr(current, KPROBE_HASH_BITS); /* * Nested is a workaround that will soon not be needed. * There's other protections that make sure the same lock * is not taken on the same CPU that lockdep is unaware of. */ raw_spin_lock_irqsave_nested(&rp->lock, flags, 1); if (!hlist_empty(&rp->free_instances)) { ri = hlist_entry(rp->free_instances.first, struct kretprobe_instance, hlist); hlist_del(&ri->hlist); raw_spin_unlock_irqrestore(&rp->lock, flags); ri->rp = rp; ri->task = current; if (rp->entry_handler && rp->entry_handler(ri, regs)) { raw_spin_lock_irqsave_nested(&rp->lock, flags, 1); hlist_add_head(&ri->hlist, &rp->free_instances); raw_spin_unlock_irqrestore(&rp->lock, flags); return 0; } arch_prepare_kretprobe(ri, regs); /* XXX(hch): why is there no hlist_move_head? */ INIT_HLIST_NODE(&ri->hlist); kretprobe_table_lock(hash, &flags); hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]); kretprobe_table_unlock(hash, &flags); } else { rp->nmissed++; raw_spin_unlock_irqrestore(&rp->lock, flags); } return 0; } NOKPROBE_SYMBOL(pre_handler_kretprobe); bool __weak arch_kprobe_on_func_entry(unsigned long offset) { return !offset; } /** * kprobe_on_func_entry() -- check whether given address is function entry * @addr: Target address * @sym: Target symbol name * @offset: The offset from the symbol or the address * * This checks whether the given @addr+@offset or @sym+@offset is on the * function entry address or not. * This returns 0 if it is the function entry, or -EINVAL if it is not. * And also it returns -ENOENT if it fails the symbol or address lookup. * Caller must pass @addr or @sym (either one must be NULL), or this * returns -EINVAL. */ int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset) { kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset); if (IS_ERR(kp_addr)) return PTR_ERR(kp_addr); if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset)) return -ENOENT; if (!arch_kprobe_on_func_entry(offset)) return -EINVAL; return 0; } int register_kretprobe(struct kretprobe *rp) { int ret; struct kretprobe_instance *inst; int i; void *addr; ret = kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset); if (ret) return ret; /* If only rp->kp.addr is specified, check reregistering kprobes */ if (rp->kp.addr && check_kprobe_rereg(&rp->kp)) return -EINVAL; if (kretprobe_blacklist_size) { addr = kprobe_addr(&rp->kp); if (IS_ERR(addr)) return PTR_ERR(addr); for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { if (kretprobe_blacklist[i].addr == addr) return -EINVAL; } } if (rp->data_size > KRETPROBE_MAX_DATA_SIZE) return -E2BIG; rp->kp.pre_handler = pre_handler_kretprobe; rp->kp.post_handler = NULL; rp->kp.fault_handler = NULL; /* Pre-allocate memory for max kretprobe instances */ if (rp->maxactive <= 0) { #ifdef CONFIG_PREEMPTION rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus()); #else rp->maxactive = num_possible_cpus(); #endif } raw_spin_lock_init(&rp->lock); INIT_HLIST_HEAD(&rp->free_instances); for (i = 0; i < rp->maxactive; i++) { inst = kmalloc(sizeof(struct kretprobe_instance) + rp->data_size, GFP_KERNEL); if (inst == NULL) { free_rp_inst(rp); return -ENOMEM; } INIT_HLIST_NODE(&inst->hlist); hlist_add_head(&inst->hlist, &rp->free_instances); } rp->nmissed = 0; /* Establish function entry probe point */ ret = register_kprobe(&rp->kp); if (ret != 0) free_rp_inst(rp); return ret; } EXPORT_SYMBOL_GPL(register_kretprobe); int register_kretprobes(struct kretprobe **rps, int num) { int ret = 0, i; if (num <= 0) return -EINVAL; for (i = 0; i < num; i++) { ret = register_kretprobe(rps[i]); if (ret < 0) { if (i > 0) unregister_kretprobes(rps, i); break; } } return ret; } EXPORT_SYMBOL_GPL(register_kretprobes); void unregister_kretprobe(struct kretprobe *rp) { unregister_kretprobes(&rp, 1); } EXPORT_SYMBOL_GPL(unregister_kretprobe); void unregister_kretprobes(struct kretprobe **rps, int num) { int i; if (num <= 0) return; mutex_lock(&kprobe_mutex); for (i = 0; i < num; i++) if (__unregister_kprobe_top(&rps[i]->kp) < 0) rps[i]->kp.addr = NULL; mutex_unlock(&kprobe_mutex); synchronize_rcu(); for (i = 0; i < num; i++) { if (rps[i]->kp.addr) { __unregister_kprobe_bottom(&rps[i]->kp); cleanup_rp_inst(rps[i]); } } } EXPORT_SYMBOL_GPL(unregister_kretprobes); #else /* CONFIG_KRETPROBES */ int register_kretprobe(struct kretprobe *rp) { return -ENOSYS; } EXPORT_SYMBOL_GPL(register_kretprobe); int register_kretprobes(struct kretprobe **rps, int num) { return -ENOSYS; } EXPORT_SYMBOL_GPL(register_kretprobes); void unregister_kretprobe(struct kretprobe *rp) { } EXPORT_SYMBOL_GPL(unregister_kretprobe); void unregister_kretprobes(struct kretprobe **rps, int num) { } EXPORT_SYMBOL_GPL(unregister_kretprobes); static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) { return 0; } NOKPROBE_SYMBOL(pre_handler_kretprobe); #endif /* CONFIG_KRETPROBES */ /* Set the kprobe gone and remove its instruction buffer. */ static void kill_kprobe(struct kprobe *p) { struct kprobe *kp; lockdep_assert_held(&kprobe_mutex); if (WARN_ON_ONCE(kprobe_gone(p))) return; p->flags |= KPROBE_FLAG_GONE; if (kprobe_aggrprobe(p)) { /* * If this is an aggr_kprobe, we have to list all the * chained probes and mark them GONE. */ list_for_each_entry(kp, &p->list, list) kp->flags |= KPROBE_FLAG_GONE; p->post_handler = NULL; kill_optimized_kprobe(p); } /* * Here, we can remove insn_slot safely, because no thread calls * the original probed function (which will be freed soon) any more. */ arch_remove_kprobe(p); /* * The module is going away. We should disarm the kprobe which * is using ftrace, because ftrace framework is still available at * MODULE_STATE_GOING notification. */ if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed) disarm_kprobe_ftrace(p); } /* Disable one kprobe */ int disable_kprobe(struct kprobe *kp) { int ret = 0; struct kprobe *p; mutex_lock(&kprobe_mutex); /* Disable this kprobe */ p = __disable_kprobe(kp); if (IS_ERR(p)) ret = PTR_ERR(p); mutex_unlock(&kprobe_mutex); return ret; } EXPORT_SYMBOL_GPL(disable_kprobe); /* Enable one kprobe */ int enable_kprobe(struct kprobe *kp) { int ret = 0; struct kprobe *p; mutex_lock(&kprobe_mutex); /* Check whether specified probe is valid. */ p = __get_valid_kprobe(kp); if (unlikely(p == NULL)) { ret = -EINVAL; goto out; } if (kprobe_gone(kp)) { /* This kprobe has gone, we couldn't enable it. */ ret = -EINVAL; goto out; } if (p != kp) kp->flags &= ~KPROBE_FLAG_DISABLED; if (!kprobes_all_disarmed && kprobe_disabled(p)) { p->flags &= ~KPROBE_FLAG_DISABLED; ret = arm_kprobe(p); if (ret) { p->flags |= KPROBE_FLAG_DISABLED; if (p != kp) kp->flags |= KPROBE_FLAG_DISABLED; } } out: mutex_unlock(&kprobe_mutex); return ret; } EXPORT_SYMBOL_GPL(enable_kprobe); /* Caller must NOT call this in usual path. This is only for critical case */ void dump_kprobe(struct kprobe *kp) { pr_err("Dumping kprobe:\n"); pr_err("Name: %s\nOffset: %x\nAddress: %pS\n", kp->symbol_name, kp->offset, kp->addr); } NOKPROBE_SYMBOL(dump_kprobe); int kprobe_add_ksym_blacklist(unsigned long entry) { struct kprobe_blacklist_entry *ent; unsigned long offset = 0, size = 0; if (!kernel_text_address(entry) || !kallsyms_lookup_size_offset(entry, &size, &offset)) return -EINVAL; ent = kmalloc(sizeof(*ent), GFP_KERNEL); if (!ent) return -ENOMEM; ent->start_addr = entry; ent->end_addr = entry + size; INIT_LIST_HEAD(&ent->list); list_add_tail(&ent->list, &kprobe_blacklist); return (int)size; } /* Add all symbols in given area into kprobe blacklist */ int kprobe_add_area_blacklist(unsigned long start, unsigned long end) { unsigned long entry; int ret = 0; for (entry = start; entry < end; entry += ret) { ret = kprobe_add_ksym_blacklist(entry); if (ret < 0) return ret; if (ret == 0) /* In case of alias symbol */ ret = 1; } return 0; } /* Remove all symbols in given area from kprobe blacklist */ static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) { struct kprobe_blacklist_entry *ent, *n; list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) { if (ent->start_addr < start || ent->start_addr >= end) continue; list_del(&ent->list); kfree(ent); } } static void kprobe_remove_ksym_blacklist(unsigned long entry) { kprobe_remove_area_blacklist(entry, entry + 1); } int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, char *type, char *sym) { return -ERANGE; } int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { #ifdef __ARCH_WANT_KPROBES_INSN_SLOT if (!kprobe_cache_get_kallsym(&kprobe_insn_slots, &symnum, value, type, sym)) return 0; #ifdef CONFIG_OPTPROBES if (!kprobe_cache_get_kallsym(&kprobe_optinsn_slots, &symnum, value, type, sym)) return 0; #endif #endif if (!arch_kprobe_get_kallsym(&symnum, value, type, sym)) return 0; return -ERANGE; } int __init __weak arch_populate_kprobe_blacklist(void) { return 0; } /* * Lookup and populate the kprobe_blacklist. * * Unlike the kretprobe blacklist, we'll need to determine * the range of addresses that belong to the said functions, * since a kprobe need not necessarily be at the beginning * of a function. */ static int __init populate_kprobe_blacklist(unsigned long *start, unsigned long *end) { unsigned long entry; unsigned long *iter; int ret; for (iter = start; iter < end; iter++) { entry = arch_deref_entry_point((void *)*iter); ret = kprobe_add_ksym_blacklist(entry); if (ret == -EINVAL) continue; if (ret < 0) return ret; } /* Symbols in __kprobes_text are blacklisted */ ret = kprobe_add_area_blacklist((unsigned long)__kprobes_text_start, (unsigned long)__kprobes_text_end); if (ret) return ret; /* Symbols in noinstr section are blacklisted */ ret = kprobe_add_area_blacklist((unsigned long)__noinstr_text_start, (unsigned long)__noinstr_text_end); return ret ? : arch_populate_kprobe_blacklist(); } static void add_module_kprobe_blacklist(struct module *mod) { unsigned long start, end; int i; if (mod->kprobe_blacklist) { for (i = 0; i < mod->num_kprobe_blacklist; i++) kprobe_add_ksym_blacklist(mod->kprobe_blacklist[i]); } start = (unsigned long)mod->kprobes_text_start; if (start) { end = start + mod->kprobes_text_size; kprobe_add_area_blacklist(start, end); } start = (unsigned long)mod->noinstr_text_start; if (start) { end = start + mod->noinstr_text_size; kprobe_add_area_blacklist(start, end); } } static void remove_module_kprobe_blacklist(struct module *mod) { unsigned long start, end; int i; if (mod->kprobe_blacklist) { for (i = 0; i < mod->num_kprobe_blacklist; i++) kprobe_remove_ksym_blacklist(mod->kprobe_blacklist[i]); } start = (unsigned long)mod->kprobes_text_start; if (start) { end = start + mod->kprobes_text_size; kprobe_remove_area_blacklist(start, end); } start = (unsigned long)mod->noinstr_text_start; if (start) { end = start + mod->noinstr_text_size; kprobe_remove_area_blacklist(start, end); } } /* Module notifier call back, checking kprobes on the module */ static int kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) { struct module *mod = data; struct hlist_head *head; struct kprobe *p; unsigned int i; int checkcore = (val == MODULE_STATE_GOING); if (val == MODULE_STATE_COMING) { mutex_lock(&kprobe_mutex); add_module_kprobe_blacklist(mod); mutex_unlock(&kprobe_mutex); } if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) return NOTIFY_DONE; /* * When MODULE_STATE_GOING was notified, both of module .text and * .init.text sections would be freed. When MODULE_STATE_LIVE was * notified, only .init.text section would be freed. We need to * disable kprobes which have been inserted in the sections. */ mutex_lock(&kprobe_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry(p, head, hlist) { if (kprobe_gone(p)) continue; if (within_module_init((unsigned long)p->addr, mod) || (checkcore && within_module_core((unsigned long)p->addr, mod))) { /* * The vaddr this probe is installed will soon * be vfreed buy not synced to disk. Hence, * disarming the breakpoint isn't needed. * * Note, this will also move any optimized probes * that are pending to be removed from their * corresponding lists to the freeing_list and * will not be touched by the delayed * kprobe_optimizer work handler. */ kill_kprobe(p); } } } if (val == MODULE_STATE_GOING) remove_module_kprobe_blacklist(mod); mutex_unlock(&kprobe_mutex); return NOTIFY_DONE; } static struct notifier_block kprobe_module_nb = { .notifier_call = kprobes_module_callback, .priority = 0 }; /* Markers of _kprobe_blacklist section */ extern unsigned long __start_kprobe_blacklist[]; extern unsigned long __stop_kprobe_blacklist[]; void kprobe_free_init_mem(void) { void *start = (void *)(&__init_begin); void *end = (void *)(&__init_end); struct hlist_head *head; struct kprobe *p; int i; mutex_lock(&kprobe_mutex); /* Kill all kprobes on initmem */ for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry(p, head, hlist) { if (start <= (void *)p->addr && (void *)p->addr < end) kill_kprobe(p); } } mutex_unlock(&kprobe_mutex); } static int __init init_kprobes(void) { int i, err = 0; /* FIXME allocate the probe table, currently defined statically */ /* initialize all list heads */ for (i = 0; i < KPROBE_TABLE_SIZE; i++) { INIT_HLIST_HEAD(&kprobe_table[i]); INIT_HLIST_HEAD(&kretprobe_inst_table[i]); raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); } err = populate_kprobe_blacklist(__start_kprobe_blacklist, __stop_kprobe_blacklist); if (err) { pr_err("kprobes: failed to populate blacklist: %d\n", err); pr_err("Please take care of using kprobes.\n"); } if (kretprobe_blacklist_size) { /* lookup the function address from its name */ for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { kretprobe_blacklist[i].addr = kprobe_lookup_name(kretprobe_blacklist[i].name, 0); if (!kretprobe_blacklist[i].addr) printk("kretprobe: lookup failed: %s\n", kretprobe_blacklist[i].name); } } /* By default, kprobes are armed */ kprobes_all_disarmed = false; #if defined(CONFIG_OPTPROBES) && defined(__ARCH_WANT_KPROBES_INSN_SLOT) /* Init kprobe_optinsn_slots for allocation */ kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE; #endif err = arch_init_kprobes(); if (!err) err = register_die_notifier(&kprobe_exceptions_nb); if (!err) err = register_module_notifier(&kprobe_module_nb); kprobes_initialized = (err == 0); if (!err) init_test_probes(); return err; } early_initcall(init_kprobes); #if defined(CONFIG_OPTPROBES) static int __init init_optprobes(void) { /* * Enable kprobe optimization - this kicks the optimizer which * depends on synchronize_rcu_tasks() and ksoftirqd, that is * not spawned in early initcall. So delay the optimization. */ optimize_all_kprobes(); return 0; } subsys_initcall(init_optprobes); #endif #ifdef CONFIG_DEBUG_FS static void report_probe(struct seq_file *pi, struct kprobe *p, const char *sym, int offset, char *modname, struct kprobe *pp) { char *kprobe_type; void *addr = p->addr; if (p->pre_handler == pre_handler_kretprobe) kprobe_type = "r"; else kprobe_type = "k"; if (!kallsyms_show_value(pi->file->f_cred)) addr = NULL; if (sym) seq_printf(pi, "%px %s %s+0x%x %s ", addr, kprobe_type, sym, offset, (modname ? modname : " ")); else /* try to use %pS */ seq_printf(pi, "%px %s %pS ", addr, kprobe_type, p->addr); if (!pp) pp = p; seq_printf(pi, "%s%s%s%s\n", (kprobe_gone(p) ? "[GONE]" : ""), ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""), (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""), (kprobe_ftrace(pp) ? "[FTRACE]" : "")); } static void *kprobe_seq_start(struct seq_file *f, loff_t *pos) { return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL; } static void *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos) { (*pos)++; if (*pos >= KPROBE_TABLE_SIZE) return NULL; return pos; } static void kprobe_seq_stop(struct seq_file *f, void *v) { /* Nothing to do */ } static int show_kprobe_addr(struct seq_file *pi, void *v) { struct hlist_head *head; struct kprobe *p, *kp; const char *sym = NULL; unsigned int i = *(loff_t *) v; unsigned long offset = 0; char *modname, namebuf[KSYM_NAME_LEN]; head = &kprobe_table[i]; preempt_disable(); hlist_for_each_entry_rcu(p, head, hlist) { sym = kallsyms_lookup((unsigned long)p->addr, NULL, &offset, &modname, namebuf); if (kprobe_aggrprobe(p)) { list_for_each_entry_rcu(kp, &p->list, list) report_probe(pi, kp, sym, offset, modname, p); } else report_probe(pi, p, sym, offset, modname, NULL); } preempt_enable(); return 0; } static const struct seq_operations kprobes_sops = { .start = kprobe_seq_start, .next = kprobe_seq_next, .stop = kprobe_seq_stop, .show = show_kprobe_addr }; DEFINE_SEQ_ATTRIBUTE(kprobes); /* kprobes/blacklist -- shows which functions can not be probed */ static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos) { mutex_lock(&kprobe_mutex); return seq_list_start(&kprobe_blacklist, *pos); } static void *kprobe_blacklist_seq_next(struct seq_file *m, void *v, loff_t *pos) { return seq_list_next(v, &kprobe_blacklist, pos); } static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) { struct kprobe_blacklist_entry *ent = list_entry(v, struct kprobe_blacklist_entry, list); /* * If /proc/kallsyms is not showing kernel address, we won't * show them here either. */ if (!kallsyms_show_value(m->file->f_cred)) seq_printf(m, "0x%px-0x%px\t%ps\n", NULL, NULL, (void *)ent->start_addr); else seq_printf(m, "0x%px-0x%px\t%ps\n", (void *)ent->start_addr, (void *)ent->end_addr, (void *)ent->start_addr); return 0; } static void kprobe_blacklist_seq_stop(struct seq_file *f, void *v) { mutex_unlock(&kprobe_mutex); } static const struct seq_operations kprobe_blacklist_sops = { .start = kprobe_blacklist_seq_start, .next = kprobe_blacklist_seq_next, .stop = kprobe_blacklist_seq_stop, .show = kprobe_blacklist_seq_show, }; DEFINE_SEQ_ATTRIBUTE(kprobe_blacklist); static int arm_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; unsigned int i, total = 0, errors = 0; int err, ret = 0; mutex_lock(&kprobe_mutex); /* If kprobes are armed, just return */ if (!kprobes_all_disarmed) goto already_enabled; /* * optimize_kprobe() called by arm_kprobe() checks * kprobes_all_disarmed, so set kprobes_all_disarmed before * arm_kprobe. */ kprobes_all_disarmed = false; /* Arming kprobes doesn't optimize kprobe itself */ for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; /* Arm all kprobes on a best-effort basis */ hlist_for_each_entry(p, head, hlist) { if (!kprobe_disabled(p)) { err = arm_kprobe(p); if (err) { errors++; ret = err; } total++; } } } if (errors) pr_warn("Kprobes globally enabled, but failed to arm %d out of %d probes\n", errors, total); else pr_info("Kprobes globally enabled\n"); already_enabled: mutex_unlock(&kprobe_mutex); return ret; } static int disarm_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; unsigned int i, total = 0, errors = 0; int err, ret = 0; mutex_lock(&kprobe_mutex); /* If kprobes are already disarmed, just return */ if (kprobes_all_disarmed) { mutex_unlock(&kprobe_mutex); return 0; } kprobes_all_disarmed = true; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; /* Disarm all kprobes on a best-effort basis */ hlist_for_each_entry(p, head, hlist) { if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) { err = disarm_kprobe(p, false); if (err) { errors++; ret = err; } total++; } } } if (errors) pr_warn("Kprobes globally disabled, but failed to disarm %d out of %d probes\n", errors, total); else pr_info("Kprobes globally disabled\n"); mutex_unlock(&kprobe_mutex); /* Wait for disarming all kprobes by optimizer */ wait_for_kprobe_optimizer(); return ret; } /* * XXX: The debugfs bool file interface doesn't allow for callbacks * when the bool state is switched. We can reuse that facility when * available */ static ssize_t read_enabled_file_bool(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { char buf[3]; if (!kprobes_all_disarmed) buf[0] = '1'; else buf[0] = '0'; buf[1] = '\n'; buf[2] = 0x00; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); } static ssize_t write_enabled_file_bool(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { char buf[32]; size_t buf_size; int ret = 0; buf_size = min(count, (sizeof(buf)-1)); if (copy_from_user(buf, user_buf, buf_size)) return -EFAULT; buf[buf_size] = '\0'; switch (buf[0]) { case 'y': case 'Y': case '1': ret = arm_all_kprobes(); break; case 'n': case 'N': case '0': ret = disarm_all_kprobes(); break; default: return -EINVAL; } if (ret) return ret; return count; } static const struct file_operations fops_kp = { .read = read_enabled_file_bool, .write = write_enabled_file_bool, .llseek = default_llseek, }; static int __init debugfs_kprobe_init(void) { struct dentry *dir; dir = debugfs_create_dir("kprobes", NULL); debugfs_create_file("list", 0400, dir, NULL, &kprobes_fops); debugfs_create_file("enabled", 0600, dir, NULL, &fops_kp); debugfs_create_file("blacklist", 0400, dir, NULL, &kprobe_blacklist_fops); return 0; } late_initcall(debugfs_kprobe_init); #endif /* CONFIG_DEBUG_FS */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_BARRIER_H #define _ASM_X86_BARRIER_H #include <asm/alternative.h> #include <asm/nops.h> /* * Force strict CPU ordering. * And yes, this might be required on UP too when we're talking * to devices. */ #ifdef CONFIG_X86_32 #define mb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "mfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") #define rmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "lfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") #define wmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "sfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") #else #define mb() asm volatile("mfence":::"memory") #define rmb() asm volatile("lfence":::"memory") #define wmb() asm volatile("sfence" ::: "memory") #endif /** * array_index_mask_nospec() - generate a mask that is ~0UL when the * bounds check succeeds and 0 otherwise * @index: array element index * @size: number of elements in array * * Returns: * 0 - (index < size) */ static inline unsigned long array_index_mask_nospec(unsigned long index, unsigned long size) { unsigned long mask; asm volatile ("cmp %1,%2; sbb %0,%0;" :"=r" (mask) :"g"(size),"r" (index) :"cc"); return mask; } /* Override the default implementation from linux/nospec.h. */ #define array_index_mask_nospec array_index_mask_nospec /* Prevent speculative execution past this barrier. */ #define barrier_nospec() alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC) #define dma_rmb() barrier() #define dma_wmb() barrier() #ifdef CONFIG_X86_32 #define __smp_mb() asm volatile("lock; addl $0,-4(%%esp)" ::: "memory", "cc") #else #define __smp_mb() asm volatile("lock; addl $0,-4(%%rsp)" ::: "memory", "cc") #endif #define __smp_rmb() dma_rmb() #define __smp_wmb() barrier() #define __smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) #define __smp_store_release(p, v) \ do { \ compiletime_assert_atomic_type(*p); \ barrier(); \ WRITE_ONCE(*p, v); \ } while (0) #define __smp_load_acquire(p) \ ({ \ typeof(*p) ___p1 = READ_ONCE(*p); \ compiletime_assert_atomic_type(*p); \ barrier(); \ ___p1; \ }) /* Atomic operations are already serializing on x86 */ #define __smp_mb__before_atomic() do { } while (0) #define __smp_mb__after_atomic() do { } while (0) #include <asm-generic/barrier.h> /* * Make previous memory operations globally visible before * a WRMSR. * * MFENCE makes writes visible, but only affects load/store * instructions. WRMSR is unfortunately not a load/store * instruction and is unaffected by MFENCE. The LFENCE ensures * that the WRMSR is not reordered. * * Most WRMSRs are full serializing instructions themselves and * do not require this barrier. This is only required for the * IA32_TSC_DEADLINE and X2APIC MSRs. */ static inline void weak_wrmsr_fence(void) { asm volatile("mfence; lfence" : : : "memory"); } #endif /* _ASM_X86_BARRIER_H */
9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_JUMP_LABEL_H #define _ASM_X86_JUMP_LABEL_H #define HAVE_JUMP_LABEL_BATCH #define JUMP_LABEL_NOP_SIZE 5 #ifdef CONFIG_X86_64 # define STATIC_KEY_INIT_NOP P6_NOP5_ATOMIC #else # define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC #endif #include <asm/asm.h> #include <asm/nops.h> #ifndef __ASSEMBLY__ #include <linux/stringify.h> #include <linux/types.h> static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { asm_volatile_goto("1:" ".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t" ".pushsection __jump_table, \"aw\" \n\t" _ASM_ALIGN "\n\t" ".long 1b - ., %l[l_yes] - . \n\t" _ASM_PTR "%c0 + %c1 - .\n\t" ".popsection \n\t" : : "i" (key), "i" (branch) : : l_yes); return false; l_yes: return true; } static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { asm_volatile_goto("1:" ".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t" "2:\n\t" ".pushsection __jump_table, \"aw\" \n\t" _ASM_ALIGN "\n\t" ".long 1b - ., %l[l_yes] - . \n\t" _ASM_PTR "%c0 + %c1 - .\n\t" ".popsection \n\t" : : "i" (key), "i" (branch) : : l_yes); return false; l_yes: return true; } #else /* __ASSEMBLY__ */ .macro STATIC_JUMP_IF_TRUE target, key, def .Lstatic_jump_\@: .if \def /* Equivalent to "jmp.d32 \target" */ .byte 0xe9 .long \target - .Lstatic_jump_after_\@ .Lstatic_jump_after_\@: .else .byte STATIC_KEY_INIT_NOP .endif .pushsection __jump_table, "aw" _ASM_ALIGN .long .Lstatic_jump_\@ - ., \target - . _ASM_PTR \key - . .popsection .endm .macro STATIC_JUMP_IF_FALSE target, key, def .Lstatic_jump_\@: .if \def .byte STATIC_KEY_INIT_NOP .else /* Equivalent to "jmp.d32 \target" */ .byte 0xe9 .long \target - .Lstatic_jump_after_\@ .Lstatic_jump_after_\@: .endif .pushsection __jump_table, "aw" _ASM_ALIGN .long .Lstatic_jump_\@ - ., \target - . _ASM_PTR \key + 1 - . .popsection .endm #endif /* __ASSEMBLY__ */ #endif
1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 // SPDX-License-Identifier: GPL-2.0 /* * Out-of-line refcount functions. */ #include <linux/mutex.h> #include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/bug.h> #define REFCOUNT_WARN(str) WARN_ONCE(1, "refcount_t: " str ".\n") void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t) { refcount_set(r, REFCOUNT_SATURATED); switch (t) { case REFCOUNT_ADD_NOT_ZERO_OVF: REFCOUNT_WARN("saturated; leaking memory"); break; case REFCOUNT_ADD_OVF: REFCOUNT_WARN("saturated; leaking memory"); break; case REFCOUNT_ADD_UAF: REFCOUNT_WARN("addition on 0; use-after-free"); break; case REFCOUNT_SUB_UAF: REFCOUNT_WARN("underflow; use-after-free"); break; case REFCOUNT_DEC_LEAK: REFCOUNT_WARN("decrement hit 0; leaking memory"); break; default: REFCOUNT_WARN("unknown saturation event!?"); } } EXPORT_SYMBOL(refcount_warn_saturate); /** * refcount_dec_if_one - decrement a refcount if it is 1 * @r: the refcount * * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the * success thereof. * * Like all decrement operations, it provides release memory order and provides * a control dependency. * * It can be used like a try-delete operator; this explicit case is provided * and not cmpxchg in generic, because that would allow implementing unsafe * operations. * * Return: true if the resulting refcount is 0, false otherwise */ bool refcount_dec_if_one(refcount_t *r) { int val = 1; return atomic_try_cmpxchg_release(&r->refs, &val, 0); } EXPORT_SYMBOL(refcount_dec_if_one); /** * refcount_dec_not_one - decrement a refcount if it is not 1 * @r: the refcount * * No atomic_t counterpart, it decrements unless the value is 1, in which case * it will return false. * * Was often done like: atomic_add_unless(&var, -1, 1) * * Return: true if the decrement operation was successful, false otherwise */ bool refcount_dec_not_one(refcount_t *r) { unsigned int new, val = atomic_read(&r->refs); do { if (unlikely(val == REFCOUNT_SATURATED)) return true; if (val == 1) return false; new = val - 1; if (new > val) { WARN_ONCE(new > val, "refcount_t: underflow; use-after-free.\n"); return true; } } while (!atomic_try_cmpxchg_release(&r->refs, &val, new)); return true; } EXPORT_SYMBOL(refcount_dec_not_one); /** * refcount_dec_and_mutex_lock - return holding mutex if able to decrement * refcount to 0 * @r: the refcount * @lock: the mutex to be locked * * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail * to decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top. * * Return: true and hold mutex if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) { if (refcount_dec_not_one(r)) return false; mutex_lock(lock); if (!refcount_dec_and_test(r)) { mutex_unlock(lock); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_mutex_lock); /** * refcount_dec_and_lock - return holding spinlock if able to decrement * refcount to 0 * @r: the refcount * @lock: the spinlock to be locked * * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to * decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top. * * Return: true and hold spinlock if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) { if (refcount_dec_not_one(r)) return false; spin_lock(lock); if (!refcount_dec_and_test(r)) { spin_unlock(lock); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_lock); /** * refcount_dec_and_lock_irqsave - return holding spinlock with disabled * interrupts if able to decrement refcount to 0 * @r: the refcount * @lock: the spinlock to be locked * @flags: saved IRQ-flags if the is acquired * * Same as refcount_dec_and_lock() above except that the spinlock is acquired * with disabled interupts. * * Return: true and hold spinlock if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_lock_irqsave(refcount_t *r, spinlock_t *lock, unsigned long *flags) { if (refcount_dec_not_one(r)) return false; spin_lock_irqsave(lock, *flags); if (!refcount_dec_and_test(r)) { spin_unlock_irqrestore(lock, *flags); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_lock_irqsave);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_UACCESS_H__ #define __LINUX_UACCESS_H__ #include <linux/fault-inject-usercopy.h> #include <linux/instrumented.h> #include <linux/minmax.h> #include <linux/sched.h> #include <linux/thread_info.h> #include <asm/uaccess.h> #ifdef CONFIG_SET_FS /* * Force the uaccess routines to be wired up for actual userspace access, * overriding any possible set_fs(KERNEL_DS) still lingering around. Undone * using force_uaccess_end below. */ static inline mm_segment_t force_uaccess_begin(void) { mm_segment_t fs = get_fs(); set_fs(USER_DS); return fs; } static inline void force_uaccess_end(mm_segment_t oldfs) { set_fs(oldfs); } #else /* CONFIG_SET_FS */ typedef struct { /* empty dummy */ } mm_segment_t; #ifndef TASK_SIZE_MAX #define TASK_SIZE_MAX TASK_SIZE #endif #define uaccess_kernel() (false) #define user_addr_max() (TASK_SIZE_MAX) static inline mm_segment_t force_uaccess_begin(void) { return (mm_segment_t) { }; } static inline void force_uaccess_end(mm_segment_t oldfs) { } #endif /* CONFIG_SET_FS */ /* * Architectures should provide two primitives (raw_copy_{to,from}_user()) * and get rid of their private instances of copy_{to,from}_user() and * __copy_{to,from}_user{,_inatomic}(). * * raw_copy_{to,from}_user(to, from, size) should copy up to size bytes and * return the amount left to copy. They should assume that access_ok() has * already been checked (and succeeded); they should *not* zero-pad anything. * No KASAN or object size checks either - those belong here. * * Both of these functions should attempt to copy size bytes starting at from * into the area starting at to. They must not fetch or store anything * outside of those areas. Return value must be between 0 (everything * copied successfully) and size (nothing copied). * * If raw_copy_{to,from}_user(to, from, size) returns N, size - N bytes starting * at to must become equal to the bytes fetched from the corresponding area * starting at from. All data past to + size - N must be left unmodified. * * If copying succeeds, the return value must be 0. If some data cannot be * fetched, it is permitted to copy less than had been fetched; the only * hard requirement is that not storing anything at all (i.e. returning size) * should happen only when nothing could be copied. In other words, you don't * have to squeeze as much as possible - it is allowed, but not necessary. * * For raw_copy_from_user() to always points to kernel memory and no faults * on store should happen. Interpretation of from is affected by set_fs(). * For raw_copy_to_user() it's the other way round. * * Both can be inlined - it's up to architectures whether it wants to bother * with that. They should not be used directly; they are used to implement * the 6 functions (copy_{to,from}_user(), __copy_{to,from}_user_inatomic()) * that are used instead. Out of those, __... ones are inlined. Plain * copy_{to,from}_user() might or might not be inlined. If you want them * inlined, have asm/uaccess.h define INLINE_COPY_{TO,FROM}_USER. * * NOTE: only copy_from_user() zero-pads the destination in case of short copy. * Neither __copy_from_user() nor __copy_from_user_inatomic() zero anything * at all; their callers absolutely must check the return value. * * Biarch ones should also provide raw_copy_in_user() - similar to the above, * but both source and destination are __user pointers (affected by set_fs() * as usual) and both source and destination can trigger faults. */ static __always_inline __must_check unsigned long __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { instrument_copy_from_user(to, from, n); check_object_size(to, n, false); return raw_copy_from_user(to, from, n); } static __always_inline __must_check unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { might_fault(); if (should_fail_usercopy()) return n; instrument_copy_from_user(to, from, n); check_object_size(to, n, false); return raw_copy_from_user(to, from, n); } /** * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking. * @to: Destination address, in user space. * @from: Source address, in kernel space. * @n: Number of bytes to copy. * * Context: User context only. * * Copy data from kernel space to user space. Caller must check * the specified block with access_ok() before calling this function. * The caller should also make sure he pins the user space address * so that we don't result in page fault and sleep. */ static __always_inline __must_check unsigned long __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { if (should_fail_usercopy()) return n; instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); } static __always_inline __must_check unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); if (should_fail_usercopy()) return n; instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); } #ifdef INLINE_COPY_FROM_USER static inline __must_check unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res = n; might_fault(); if (!should_fail_usercopy() && likely(access_ok(from, n))) { instrument_copy_from_user(to, from, n); res = raw_copy_from_user(to, from, n); } if (unlikely(res)) memset(to + (n - res), 0, res); return res; } #else extern __must_check unsigned long _copy_from_user(void *, const void __user *, unsigned long); #endif #ifdef INLINE_COPY_TO_USER static inline __must_check unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); if (should_fail_usercopy()) return n; if (access_ok(to, n)) { instrument_copy_to_user(to, from, n); n = raw_copy_to_user(to, from, n); } return n; } #else extern __must_check unsigned long _copy_to_user(void __user *, const void *, unsigned long); #endif static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { if (likely(check_copy_size(to, n, false))) n = _copy_from_user(to, from, n); return n; } static __always_inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { if (likely(check_copy_size(from, n, true))) n = _copy_to_user(to, from, n); return n; } #ifdef CONFIG_COMPAT static __always_inline unsigned long __must_check copy_in_user(void __user *to, const void __user *from, unsigned long n) { might_fault(); if (access_ok(to, n) && access_ok(from, n)) n = raw_copy_in_user(to, from, n); return n; } #endif #ifndef copy_mc_to_kernel /* * Without arch opt-in this generic copy_mc_to_kernel() will not handle * #MC (or arch equivalent) during source read. */ static inline unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, size_t cnt) { memcpy(dst, src, cnt); return 0; } #endif static __always_inline void pagefault_disabled_inc(void) { current->pagefault_disabled++; } static __always_inline void pagefault_disabled_dec(void) { current->pagefault_disabled--; } /* * These routines enable/disable the pagefault handler. If disabled, it will * not take any locks and go straight to the fixup table. * * User access methods will not sleep when called from a pagefault_disabled() * environment. */ static inline void pagefault_disable(void) { pagefault_disabled_inc(); /* * make sure to have issued the store before a pagefault * can hit. */ barrier(); } static inline void pagefault_enable(void) { /* * make sure to issue those last loads/stores before enabling * the pagefault handler again. */ barrier(); pagefault_disabled_dec(); } /* * Is the pagefault handler disabled? If so, user access methods will not sleep. */ static inline bool pagefault_disabled(void) { return current->pagefault_disabled != 0; } /* * The pagefault handler is in general disabled by pagefault_disable() or * when in irq context (via in_atomic()). * * This function should only be used by the fault handlers. Other users should * stick to pagefault_disabled(). * Please NEVER use preempt_disable() to disable the fault handler. With * !CONFIG_PREEMPT_COUNT, this is like a NOP. So the handler won't be disabled. * in_atomic() will report different values based on !CONFIG_PREEMPT_COUNT. */ #define faulthandler_disabled() (pagefault_disabled() || in_atomic()) #ifndef ARCH_HAS_NOCACHE_UACCESS static inline __must_check unsigned long __copy_from_user_inatomic_nocache(void *to, const void __user *from, unsigned long n) { return __copy_from_user_inatomic(to, from, n); } #endif /* ARCH_HAS_NOCACHE_UACCESS */ extern __must_check int check_zeroed_user(const void __user *from, size_t size); /** * copy_struct_from_user: copy a struct from userspace * @dst: Destination address, in kernel space. This buffer must be @ksize * bytes long. * @ksize: Size of @dst struct. * @src: Source address, in userspace. * @usize: (Alleged) size of @src struct. * * Copies a struct from userspace to kernel space, in a way that guarantees * backwards-compatibility for struct syscall arguments (as long as future * struct extensions are made such that all new fields are *appended* to the * old struct, and zeroed-out new fields have the same meaning as the old * struct). * * @ksize is just sizeof(*dst), and @usize should've been passed by userspace. * The recommended usage is something like the following: * * SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize) * { * int err; * struct foo karg = {}; * * if (usize > PAGE_SIZE) * return -E2BIG; * if (usize < FOO_SIZE_VER0) * return -EINVAL; * * err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize); * if (err) * return err; * * // ... * } * * There are three cases to consider: * * If @usize == @ksize, then it's copied verbatim. * * If @usize < @ksize, then the userspace has passed an old struct to a * newer kernel. The rest of the trailing bytes in @dst (@ksize - @usize) * are to be zero-filled. * * If @usize > @ksize, then the userspace has passed a new struct to an * older kernel. The trailing bytes unknown to the kernel (@usize - @ksize) * are checked to ensure they are zeroed, otherwise -E2BIG is returned. * * Returns (in all cases, some data may have been copied): * * -E2BIG: (@usize > @ksize) and there are non-zero trailing bytes in @src. * * -EFAULT: access to userspace failed. */ static __always_inline __must_check int copy_struct_from_user(void *dst, size_t ksize, const void __user *src, size_t usize) { size_t size = min(ksize, usize); size_t rest = max(ksize, usize) - size; /* Double check if ksize is larger than a known object size. */ if (WARN_ON_ONCE(ksize > __builtin_object_size(dst, 1))) return -E2BIG; /* Deal with trailing bytes. */ if (usize < ksize) { memset(dst + size, 0, rest); } else if (usize > ksize) { int ret = check_zeroed_user(src + size, rest); if (ret <= 0) return ret ?: -E2BIG; } /* Copy the interoperable parts of the struct. */ if (copy_from_user(dst, src, size)) return -EFAULT; return 0; } bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size); long copy_from_kernel_nofault(void *dst, const void *src, size_t size); long notrace copy_to_kernel_nofault(void *dst, const void *src, size_t size); long copy_from_user_nofault(void *dst, const void __user *src, size_t size); long notrace copy_to_user_nofault(void __user *dst, const void *src, size_t size); long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count); long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count); long strnlen_user_nofault(const void __user *unsafe_addr, long count); /** * get_kernel_nofault(): safely attempt to read from a location * @val: read into this variable * @ptr: address to read from * * Returns 0 on success, or -EFAULT. */ #define get_kernel_nofault(val, ptr) ({ \ const typeof(val) *__gk_ptr = (ptr); \ copy_from_kernel_nofault(&(val), __gk_ptr, sizeof(val));\ }) #ifndef user_access_begin #define user_access_begin(ptr,len) access_ok(ptr, len) #define user_access_end() do { } while (0) #define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0) #define unsafe_get_user(x,p,e) unsafe_op_wrap(__get_user(x,p),e) #define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e) #define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e) static inline unsigned long user_access_save(void) { return 0UL; } static inline void user_access_restore(unsigned long flags) { } #endif #ifndef user_write_access_begin #define user_write_access_begin user_access_begin #define user_write_access_end user_access_end #endif #ifndef user_read_access_begin #define user_read_access_begin user_access_begin #define user_read_access_end user_access_end #endif #ifdef CONFIG_HARDENED_USERCOPY void usercopy_warn(const char *name, const char *detail, bool to_user, unsigned long offset, unsigned long len); void __noreturn usercopy_abort(const char *name, const char *detail, bool to_user, unsigned long offset, unsigned long len); #endif #endif /* __LINUX_UACCESS_H__ */
4 4 1 1 1 3 4 2 4 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 // SPDX-License-Identifier: GPL-2.0-or-later /* Common capabilities, needed by capability.o. */ #include <linux/capability.h> #include <linux/audit.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/lsm_hooks.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/ptrace.h> #include <linux/xattr.h> #include <linux/hugetlb.h> #include <linux/mount.h> #include <linux/sched.h> #include <linux/prctl.h> #include <linux/securebits.h> #include <linux/user_namespace.h> #include <linux/binfmts.h> #include <linux/personality.h> /* * If a non-root user executes a setuid-root binary in * !secure(SECURE_NOROOT) mode, then we raise capabilities. * However if fE is also set, then the intent is for only * the file capabilities to be applied, and the setuid-root * bit is left on either to change the uid (plausible) or * to get full privilege on a kernel without file capabilities * support. So in that case we do not raise capabilities. * * Warn if that happens, once per boot. */ static void warn_setuid_and_fcaps_mixed(const char *fname) { static int warned; if (!warned) { printk(KERN_INFO "warning: `%s' has both setuid-root and" " effective capabilities. Therefore not raising all" " capabilities.\n", fname); warned = 1; } } /** * cap_capable - Determine whether a task has a particular effective capability * @cred: The credentials to use * @ns: The user namespace in which we need the capability * @cap: The capability to check for * @opts: Bitmask of options defined in include/linux/security.h * * Determine whether the nominated task has the specified capability amongst * its effective set, returning 0 if it does, -ve if it does not. * * NOTE WELL: cap_has_capability() cannot be used like the kernel's capable() * and has_capability() functions. That is, it has the reverse semantics: * cap_has_capability() returns 0 when a task has a capability, but the * kernel's capable() and has_capability() returns 1 for this case. */ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns, int cap, unsigned int opts) { struct user_namespace *ns = targ_ns; /* See if cred has the capability in the target user namespace * by examining the target user namespace and all of the target * user namespace's parents. */ for (;;) { /* Do we have the necessary capabilities? */ if (ns == cred->user_ns) return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM; /* * If we're already at a lower level than we're looking for, * we're done searching. */ if (ns->level <= cred->user_ns->level) return -EPERM; /* * The owner of the user namespace in the parent of the * user namespace has all caps. */ if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid)) return 0; /* * If you have a capability in a parent user ns, then you have * it over all children user namespaces as well. */ ns = ns->parent; } /* We never get here */ } /** * cap_settime - Determine whether the current process may set the system clock * @ts: The time to set * @tz: The timezone to set * * Determine whether the current process may set the system clock and timezone * information, returning 0 if permission granted, -ve if denied. */ int cap_settime(const struct timespec64 *ts, const struct timezone *tz) { if (!capable(CAP_SYS_TIME)) return -EPERM; return 0; } /** * cap_ptrace_access_check - Determine whether the current process may access * another * @child: The process to be accessed * @mode: The mode of attachment. * * If we are in the same or an ancestor user_ns and have all the target * task's capabilities, then ptrace access is allowed. * If we have the ptrace capability to the target user_ns, then ptrace * access is allowed. * Else denied. * * Determine whether a process may access another, returning 0 if permission * granted, -ve if denied. */ int cap_ptrace_access_check(struct task_struct *child, unsigned int mode) { int ret = 0; const struct cred *cred, *child_cred; const kernel_cap_t *caller_caps; rcu_read_lock(); cred = current_cred(); child_cred = __task_cred(child); if (mode & PTRACE_MODE_FSCREDS) caller_caps = &cred->cap_effective; else caller_caps = &cred->cap_permitted; if (cred->user_ns == child_cred->user_ns && cap_issubset(child_cred->cap_permitted, *caller_caps)) goto out; if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE)) goto out; ret = -EPERM; out: rcu_read_unlock(); return ret; } /** * cap_ptrace_traceme - Determine whether another process may trace the current * @parent: The task proposed to be the tracer * * If parent is in the same or an ancestor user_ns and has all current's * capabilities, then ptrace access is allowed. * If parent has the ptrace capability to current's user_ns, then ptrace * access is allowed. * Else denied. * * Determine whether the nominated task is permitted to trace the current * process, returning 0 if permission is granted, -ve if denied. */ int cap_ptrace_traceme(struct task_struct *parent) { int ret = 0; const struct cred *cred, *child_cred; rcu_read_lock(); cred = __task_cred(parent); child_cred = current_cred(); if (cred->user_ns == child_cred->user_ns && cap_issubset(child_cred->cap_permitted, cred->cap_permitted)) goto out; if (has_ns_capability(parent, child_cred->user_ns, CAP_SYS_PTRACE)) goto out; ret = -EPERM; out: rcu_read_unlock(); return ret; } /** * cap_capget - Retrieve a task's capability sets * @target: The task from which to retrieve the capability sets * @effective: The place to record the effective set * @inheritable: The place to record the inheritable set * @permitted: The place to record the permitted set * * This function retrieves the capabilities of the nominated task and returns * them to the caller. */ int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { const struct cred *cred; /* Derived from kernel/capability.c:sys_capget. */ rcu_read_lock(); cred = __task_cred(target); *effective = cred->cap_effective; *inheritable = cred->cap_inheritable; *permitted = cred->cap_permitted; rcu_read_unlock(); return 0; } /* * Determine whether the inheritable capabilities are limited to the old * permitted set. Returns 1 if they are limited, 0 if they are not. */ static inline int cap_inh_is_capped(void) { /* they are so limited unless the current task has the CAP_SETPCAP * capability */ if (cap_capable(current_cred(), current_cred()->user_ns, CAP_SETPCAP, CAP_OPT_NONE) == 0) return 0; return 1; } /** * cap_capset - Validate and apply proposed changes to current's capabilities * @new: The proposed new credentials; alterations should be made here * @old: The current task's current credentials * @effective: A pointer to the proposed new effective capabilities set * @inheritable: A pointer to the proposed new inheritable capabilities set * @permitted: A pointer to the proposed new permitted capabilities set * * This function validates and applies a proposed mass change to the current * process's capability sets. The changes are made to the proposed new * credentials, and assuming no error, will be committed by the caller of LSM. */ int cap_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) { if (cap_inh_is_capped() && !cap_issubset(*inheritable, cap_combine(old->cap_inheritable, old->cap_permitted))) /* incapable of using this inheritable set */ return -EPERM; if (!cap_issubset(*inheritable, cap_combine(old->cap_inheritable, old->cap_bset))) /* no new pI capabilities outside bounding set */ return -EPERM; /* verify restrictions on target's new Permitted set */ if (!cap_issubset(*permitted, old->cap_permitted)) return -EPERM; /* verify the _new_Effective_ is a subset of the _new_Permitted_ */ if (!cap_issubset(*effective, *permitted)) return -EPERM; new->cap_effective = *effective; new->cap_inheritable = *inheritable; new->cap_permitted = *permitted; /* * Mask off ambient bits that are no longer both permitted and * inheritable. */ new->cap_ambient = cap_intersect(new->cap_ambient, cap_intersect(*permitted, *inheritable)); if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EINVAL; return 0; } /** * cap_inode_need_killpriv - Determine if inode change affects privileges * @dentry: The inode/dentry in being changed with change marked ATTR_KILL_PRIV * * Determine if an inode having a change applied that's marked ATTR_KILL_PRIV * affects the security markings on that inode, and if it is, should * inode_killpriv() be invoked or the change rejected. * * Returns 1 if security.capability has a value, meaning inode_killpriv() * is required, 0 otherwise, meaning inode_killpriv() is not required. */ int cap_inode_need_killpriv(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); int error; error = __vfs_getxattr(dentry, inode, XATTR_NAME_CAPS, NULL, 0); return error > 0; } /** * cap_inode_killpriv - Erase the security markings on an inode * @dentry: The inode/dentry to alter * * Erase the privilege-enhancing security markings on an inode. * * Returns 0 if successful, -ve on error. */ int cap_inode_killpriv(struct dentry *dentry) { int error; error = __vfs_removexattr(dentry, XATTR_NAME_CAPS); if (error == -EOPNOTSUPP) error = 0; return error; } static bool rootid_owns_currentns(kuid_t kroot) { struct user_namespace *ns; if (!uid_valid(kroot)) return false; for (ns = current_user_ns(); ; ns = ns->parent) { if (from_kuid(ns, kroot) == 0) return true; if (ns == &init_user_ns) break; } return false; } static __u32 sansflags(__u32 m) { return m & ~VFS_CAP_FLAGS_EFFECTIVE; } static bool is_v2header(size_t size, const struct vfs_cap_data *cap) { if (size != XATTR_CAPS_SZ_2) return false; return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2; } static bool is_v3header(size_t size, const struct vfs_cap_data *cap) { if (size != XATTR_CAPS_SZ_3) return false; return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3; } /* * getsecurity: We are called for security.* before any attempt to read the * xattr from the inode itself. * * This gives us a chance to read the on-disk value and convert it. If we * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler. * * Note we are not called by vfs_getxattr_alloc(), but that is only called * by the integrity subsystem, which really wants the unconverted values - * so that's good. */ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc) { int size, ret; kuid_t kroot; u32 nsmagic, magic; uid_t root, mappedroot; char *tmpbuf = NULL; struct vfs_cap_data *cap; struct vfs_ns_cap_data *nscap = NULL; struct dentry *dentry; struct user_namespace *fs_ns; if (strcmp(name, "capability") != 0) return -EOPNOTSUPP; dentry = d_find_any_alias(inode); if (!dentry) return -EINVAL; size = sizeof(struct vfs_ns_cap_data); ret = (int) vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS, &tmpbuf, size, GFP_NOFS); dput(dentry); if (ret < 0 || !tmpbuf) { size = ret; goto out_free; } fs_ns = inode->i_sb->s_user_ns; cap = (struct vfs_cap_data *) tmpbuf; if (is_v2header((size_t) ret, cap)) { root = 0; } else if (is_v3header((size_t) ret, cap)) { nscap = (struct vfs_ns_cap_data *) tmpbuf; root = le32_to_cpu(nscap->rootid); } else { size = -EINVAL; goto out_free; } kroot = make_kuid(fs_ns, root); /* If the root kuid maps to a valid uid in current ns, then return * this as a nscap. */ mappedroot = from_kuid(current_user_ns(), kroot); if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) { size = sizeof(struct vfs_ns_cap_data); if (alloc) { if (!nscap) { /* v2 -> v3 conversion */ nscap = kzalloc(size, GFP_ATOMIC); if (!nscap) { size = -ENOMEM; goto out_free; } nsmagic = VFS_CAP_REVISION_3; magic = le32_to_cpu(cap->magic_etc); if (magic & VFS_CAP_FLAGS_EFFECTIVE) nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); nscap->magic_etc = cpu_to_le32(nsmagic); } else { /* use allocated v3 buffer */ tmpbuf = NULL; } nscap->rootid = cpu_to_le32(mappedroot); *buffer = nscap; } goto out_free; } if (!rootid_owns_currentns(kroot)) { size = -EOVERFLOW; goto out_free; } /* This comes from a parent namespace. Return as a v2 capability */ size = sizeof(struct vfs_cap_data); if (alloc) { if (nscap) { /* v3 -> v2 conversion */ cap = kzalloc(size, GFP_ATOMIC); if (!cap) { size = -ENOMEM; goto out_free; } magic = VFS_CAP_REVISION_2; nsmagic = le32_to_cpu(nscap->magic_etc); if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE) magic |= VFS_CAP_FLAGS_EFFECTIVE; memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32); cap->magic_etc = cpu_to_le32(magic); } else { /* use unconverted v2 */ tmpbuf = NULL; } *buffer = cap; } out_free: kfree(tmpbuf); return size; } static kuid_t rootid_from_xattr(const void *value, size_t size, struct user_namespace *task_ns) { const struct vfs_ns_cap_data *nscap = value; uid_t rootid = 0; if (size == XATTR_CAPS_SZ_3) rootid = le32_to_cpu(nscap->rootid); return make_kuid(task_ns, rootid); } static bool validheader(size_t size, const struct vfs_cap_data *cap) { return is_v2header(size, cap) || is_v3header(size, cap); } /* * User requested a write of security.capability. If needed, update the * xattr to change from v2 to v3, or to fixup the v3 rootid. * * If all is ok, we return the new size, on error return < 0. */ int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size) { struct vfs_ns_cap_data *nscap; uid_t nsrootid; const struct vfs_cap_data *cap = *ivalue; __u32 magic, nsmagic; struct inode *inode = d_backing_inode(dentry); struct user_namespace *task_ns = current_user_ns(), *fs_ns = inode->i_sb->s_user_ns; kuid_t rootid; size_t newsize; if (!*ivalue) return -EINVAL; if (!validheader(size, cap)) return -EINVAL; if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP)) return -EPERM; if (size == XATTR_CAPS_SZ_2) if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP)) /* user is privileged, just write the v2 */ return size; rootid = rootid_from_xattr(*ivalue, size, task_ns); if (!uid_valid(rootid)) return -EINVAL; nsrootid = from_kuid(fs_ns, rootid); if (nsrootid == -1) return -EINVAL; newsize = sizeof(struct vfs_ns_cap_data); nscap = kmalloc(newsize, GFP_ATOMIC); if (!nscap) return -ENOMEM; nscap->rootid = cpu_to_le32(nsrootid); nsmagic = VFS_CAP_REVISION_3; magic = le32_to_cpu(cap->magic_etc); if (magic & VFS_CAP_FLAGS_EFFECTIVE) nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; nscap->magic_etc = cpu_to_le32(nsmagic); memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); kvfree(*ivalue); *ivalue = nscap; return newsize; } /* * Calculate the new process capability sets from the capability sets attached * to a file. */ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, struct linux_binprm *bprm, bool *effective, bool *has_fcap) { struct cred *new = bprm->cred; unsigned i; int ret = 0; if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE) *effective = true; if (caps->magic_etc & VFS_CAP_REVISION_MASK) *has_fcap = true; CAP_FOR_EACH_U32(i) { __u32 permitted = caps->permitted.cap[i]; __u32 inheritable = caps->inheritable.cap[i]; /* * pP' = (X & fP) | (pI & fI) * The addition of pA' is handled later. */ new->cap_permitted.cap[i] = (new->cap_bset.cap[i] & permitted) | (new->cap_inheritable.cap[i] & inheritable); if (permitted & ~new->cap_permitted.cap[i]) /* insufficient to execute correctly */ ret = -EPERM; } /* * For legacy apps, with no internal support for recognizing they * do not have enough capabilities, we return an error if they are * missing some "forced" (aka file-permitted) capabilities. */ return *effective ? ret : 0; } /* * Extract the on-exec-apply capability sets for an executable file. */ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps) { struct inode *inode = d_backing_inode(dentry); __u32 magic_etc; unsigned tocopy, i; int size; struct vfs_ns_cap_data data, *nscaps = &data; struct vfs_cap_data *caps = (struct vfs_cap_data *) &data; kuid_t rootkuid; struct user_namespace *fs_ns; memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data)); if (!inode) return -ENODATA; fs_ns = inode->i_sb->s_user_ns; size = __vfs_getxattr((struct dentry *)dentry, inode, XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ); if (size == -ENODATA || size == -EOPNOTSUPP) /* no data, that's ok */ return -ENODATA; if (size < 0) return size; if (size < sizeof(magic_etc)) return -EINVAL; cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc); rootkuid = make_kuid(fs_ns, 0); switch (magic_etc & VFS_CAP_REVISION_MASK) { case VFS_CAP_REVISION_1: if (size != XATTR_CAPS_SZ_1) return -EINVAL; tocopy = VFS_CAP_U32_1; break; case VFS_CAP_REVISION_2: if (size != XATTR_CAPS_SZ_2) return -EINVAL; tocopy = VFS_CAP_U32_2; break; case VFS_CAP_REVISION_3: if (size != XATTR_CAPS_SZ_3) return -EINVAL; tocopy = VFS_CAP_U32_3; rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid)); break; default: return -EINVAL; } /* Limit the caps to the mounter of the filesystem * or the more limited uid specified in the xattr. */ if (!rootid_owns_currentns(rootkuid)) return -ENODATA; CAP_FOR_EACH_U32(i) { if (i >= tocopy) break; cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted); cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable); } cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; cpu_caps->rootid = rootkuid; return 0; } /* * Attempt to get the on-exec apply capability sets for an executable file from * its xattrs and, if present, apply them to the proposed credentials being * constructed by execve(). */ static int get_file_caps(struct linux_binprm *bprm, struct file *file, bool *effective, bool *has_fcap) { int rc = 0; struct cpu_vfs_cap_data vcaps; cap_clear(bprm->cred->cap_permitted); if (!file_caps_enabled) return 0; if (!mnt_may_suid(file->f_path.mnt)) return 0; /* * This check is redundant with mnt_may_suid() but is kept to make * explicit that capability bits are limited to s_user_ns and its * descendants. */ if (!current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns)) return 0; rc = get_vfs_caps_from_disk(file->f_path.dentry, &vcaps); if (rc < 0) { if (rc == -EINVAL) printk(KERN_NOTICE "Invalid argument reading file caps for %s\n", bprm->filename); else if (rc == -ENODATA) rc = 0; goto out; } rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective, has_fcap); out: if (rc) cap_clear(bprm->cred->cap_permitted); return rc; } static inline bool root_privileged(void) { return !issecure(SECURE_NOROOT); } static inline bool __is_real(kuid_t uid, struct cred *cred) { return uid_eq(cred->uid, uid); } static inline bool __is_eff(kuid_t uid, struct cred *cred) { return uid_eq(cred->euid, uid); } static inline bool __is_suid(kuid_t uid, struct cred *cred) { return !__is_real(uid, cred) && __is_eff(uid, cred); } /* * handle_privileged_root - Handle case of privileged root * @bprm: The execution parameters, including the proposed creds * @has_fcap: Are any file capabilities set? * @effective: Do we have effective root privilege? * @root_uid: This namespace' root UID WRT initial USER namespace * * Handle the case where root is privileged and hasn't been neutered by * SECURE_NOROOT. If file capabilities are set, they won't be combined with * set UID root and nothing is changed. If we are root, cap_permitted is * updated. If we have become set UID root, the effective bit is set. */ static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap, bool *effective, kuid_t root_uid) { const struct cred *old = current_cred(); struct cred *new = bprm->cred; if (!root_privileged()) return; /* * If the legacy file capability is set, then don't set privs * for a setuid root binary run by a non-root user. Do set it * for a root user just to cause least surprise to an admin. */ if (has_fcap && __is_suid(root_uid, new)) { warn_setuid_and_fcaps_mixed(bprm->filename); return; } /* * To support inheritance of root-permissions and suid-root * executables under compatibility mode, we override the * capability sets for the file. */ if (__is_eff(root_uid, new) || __is_real(root_uid, new)) { /* pP' = (cap_bset & ~0) | (pI & ~0) */ new->cap_permitted = cap_combine(old->cap_bset, old->cap_inheritable); } /* * If only the real uid is 0, we do not set the effective bit. */ if (__is_eff(root_uid, new)) *effective = true; } #define __cap_gained(field, target, source) \ !cap_issubset(target->cap_##field, source->cap_##field) #define __cap_grew(target, source, cred) \ !cap_issubset(cred->cap_##target, cred->cap_##source) #define __cap_full(field, cred) \ cap_issubset(CAP_FULL_SET, cred->cap_##field) static inline bool __is_setuid(struct cred *new, const struct cred *old) { return !uid_eq(new->euid, old->uid); } static inline bool __is_setgid(struct cred *new, const struct cred *old) { return !gid_eq(new->egid, old->gid); } /* * 1) Audit candidate if current->cap_effective is set * * We do not bother to audit if 3 things are true: * 1) cap_effective has all caps * 2) we became root *OR* are were already root * 3) root is supposed to have all caps (SECURE_NOROOT) * Since this is just a normal root execing a process. * * Number 1 above might fail if you don't have a full bset, but I think * that is interesting information to audit. * * A number of other conditions require logging: * 2) something prevented setuid root getting all caps * 3) non-setuid root gets fcaps * 4) non-setuid root gets ambient */ static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old, kuid_t root, bool has_fcap) { bool ret = false; if ((__cap_grew(effective, ambient, new) && !(__cap_full(effective, new) && (__is_eff(root, new) || __is_real(root, new)) && root_privileged())) || (root_privileged() && __is_suid(root, new) && !__cap_full(effective, new)) || (!__is_setuid(new, old) && ((has_fcap && __cap_gained(permitted, new, old)) || __cap_gained(ambient, new, old)))) ret = true; return ret; } /** * cap_bprm_creds_from_file - Set up the proposed credentials for execve(). * @bprm: The execution parameters, including the proposed creds * @file: The file to pull the credentials from * * Set up the proposed credentials for a new execution context being * constructed by execve(). The proposed creds in @bprm->cred is altered, * which won't take effect immediately. Returns 0 if successful, -ve on error. */ int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file) { /* Process setpcap binaries and capabilities for uid 0 */ const struct cred *old = current_cred(); struct cred *new = bprm->cred; bool effective = false, has_fcap = false, is_setid; int ret; kuid_t root_uid; if (WARN_ON(!cap_ambient_invariant_ok(old))) return -EPERM; ret = get_file_caps(bprm, file, &effective, &has_fcap); if (ret < 0) return ret; root_uid = make_kuid(new->user_ns, 0); handle_privileged_root(bprm, has_fcap, &effective, root_uid); /* if we have fs caps, clear dangerous personality flags */ if (__cap_gained(permitted, new, old)) bprm->per_clear |= PER_CLEAR_ON_SETID; /* Don't let someone trace a set[ug]id/setpcap binary with the revised * credentials unless they have the appropriate permit. * * In addition, if NO_NEW_PRIVS, then ensure we get no new privs. */ is_setid = __is_setuid(new, old) || __is_setgid(new, old); if ((is_setid || __cap_gained(permitted, new, old)) && ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) || !ptracer_capable(current, new->user_ns))) { /* downgrade; they get no more than they had, and maybe less */ if (!ns_capable(new->user_ns, CAP_SETUID) || (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) { new->euid = new->uid; new->egid = new->gid; } new->cap_permitted = cap_intersect(new->cap_permitted, old->cap_permitted); } new->suid = new->fsuid = new->euid; new->sgid = new->fsgid = new->egid; /* File caps or setid cancels ambient. */ if (has_fcap || is_setid) cap_clear(new->cap_ambient); /* * Now that we've computed pA', update pP' to give: * pP' = (X & fP) | (pI & fI) | pA' */ new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient); /* * Set pE' = (fE ? pP' : pA'). Because pA' is zero if fE is set, * this is the same as pE' = (fE ? pP' : 0) | pA'. */ if (effective) new->cap_effective = new->cap_permitted; else new->cap_effective = new->cap_ambient; if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EPERM; if (nonroot_raised_pE(new, old, root_uid, has_fcap)) { ret = audit_log_bprm_fcaps(bprm, new, old); if (ret < 0) return ret; } new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EPERM; /* Check for privilege-elevated exec. */ if (is_setid || (!__is_real(root_uid, new) && (effective || __cap_grew(permitted, ambient, new)))) bprm->secureexec = 1; return 0; } /** * cap_inode_setxattr - Determine whether an xattr may be altered * @dentry: The inode/dentry being altered * @name: The name of the xattr to be changed * @value: The value that the xattr will be changed to * @size: The size of value * @flags: The replacement flag * * Determine whether an xattr may be altered or set on an inode, returning 0 if * permission is granted, -ve if denied. * * This is used to make sure security xattrs don't get updated or set by those * who aren't privileged to do so. */ int cap_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct user_namespace *user_ns = dentry->d_sb->s_user_ns; /* Ignore non-security xattrs */ if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) != 0) return 0; /* * For XATTR_NAME_CAPS the check will be done in * cap_convert_nscap(), called by setxattr() */ if (strcmp(name, XATTR_NAME_CAPS) == 0) return 0; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } /** * cap_inode_removexattr - Determine whether an xattr may be removed * @dentry: The inode/dentry being altered * @name: The name of the xattr to be changed * * Determine whether an xattr may be removed from an inode, returning 0 if * permission is granted, -ve if denied. * * This is used to make sure security xattrs don't get removed by those who * aren't privileged to remove them. */ int cap_inode_removexattr(struct dentry *dentry, const char *name) { struct user_namespace *user_ns = dentry->d_sb->s_user_ns; /* Ignore non-security xattrs */ if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) != 0) return 0; if (strcmp(name, XATTR_NAME_CAPS) == 0) { /* security.capability gets namespaced */ struct inode *inode = d_backing_inode(dentry); if (!inode) return -EINVAL; if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP)) return -EPERM; return 0; } if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } /* * cap_emulate_setxuid() fixes the effective / permitted capabilities of * a process after a call to setuid, setreuid, or setresuid. * * 1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of * {r,e,s}uid != 0, the permitted and effective capabilities are * cleared. * * 2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective * capabilities of the process are cleared. * * 3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective * capabilities are set to the permitted capabilities. * * fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should * never happen. * * -astor * * cevans - New behaviour, Oct '99 * A process may, via prctl(), elect to keep its capabilities when it * calls setuid() and switches away from uid==0. Both permitted and * effective sets will be retained. * Without this change, it was impossible for a daemon to drop only some * of its privilege. The call to setuid(!=0) would drop all privileges! * Keeping uid 0 is not an option because uid 0 owns too many vital * files.. * Thanks to Olaf Kirch and Peter Benie for spotting this. */ static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old) { kuid_t root_uid = make_kuid(old->user_ns, 0); if ((uid_eq(old->uid, root_uid) || uid_eq(old->euid, root_uid) || uid_eq(old->suid, root_uid)) && (!uid_eq(new->uid, root_uid) && !uid_eq(new->euid, root_uid) && !uid_eq(new->suid, root_uid))) { if (!issecure(SECURE_KEEP_CAPS)) { cap_clear(new->cap_permitted); cap_clear(new->cap_effective); } /* * Pre-ambient programs expect setresuid to nonroot followed * by exec to drop capabilities. We should make sure that * this remains the case. */ cap_clear(new->cap_ambient); } if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid)) cap_clear(new->cap_effective); if (!uid_eq(old->euid, root_uid) && uid_eq(new->euid, root_uid)) new->cap_effective = new->cap_permitted; } /** * cap_task_fix_setuid - Fix up the results of setuid() call * @new: The proposed credentials * @old: The current task's current credentials * @flags: Indications of what has changed * * Fix up the results of setuid() call before the credential changes are * actually applied, returning 0 to grant the changes, -ve to deny them. */ int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { switch (flags) { case LSM_SETID_RE: case LSM_SETID_ID: case LSM_SETID_RES: /* juggle the capabilities to follow [RES]UID changes unless * otherwise suppressed */ if (!issecure(SECURE_NO_SETUID_FIXUP)) cap_emulate_setxuid(new, old); break; case LSM_SETID_FS: /* juggle the capabilties to follow FSUID changes, unless * otherwise suppressed * * FIXME - is fsuser used for all CAP_FS_MASK capabilities? * if not, we might be a bit too harsh here. */ if (!issecure(SECURE_NO_SETUID_FIXUP)) { kuid_t root_uid = make_kuid(old->user_ns, 0); if (uid_eq(old->fsuid, root_uid) && !uid_eq(new->fsuid, root_uid)) new->cap_effective = cap_drop_fs_set(new->cap_effective); if (!uid_eq(old->fsuid, root_uid) && uid_eq(new->fsuid, root_uid)) new->cap_effective = cap_raise_fs_set(new->cap_effective, new->cap_permitted); } break; default: return -EINVAL; } return 0; } /* * Rationale: code calling task_setscheduler, task_setioprio, and * task_setnice, assumes that * . if capable(cap_sys_nice), then those actions should be allowed * . if not capable(cap_sys_nice), but acting on your own processes, * then those actions should be allowed * This is insufficient now since you can call code without suid, but * yet with increased caps. * So we check for increased caps on the target process. */ static int cap_safe_nice(struct task_struct *p) { int is_subset, ret = 0; rcu_read_lock(); is_subset = cap_issubset(__task_cred(p)->cap_permitted, current_cred()->cap_permitted); if (!is_subset && !ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) ret = -EPERM; rcu_read_unlock(); return ret; } /** * cap_task_setscheduler - Detemine if scheduler policy change is permitted * @p: The task to affect * * Detemine if the requested scheduler policy change is permitted for the * specified task, returning 0 if permission is granted, -ve if denied. */ int cap_task_setscheduler(struct task_struct *p) { return cap_safe_nice(p); } /** * cap_task_ioprio - Detemine if I/O priority change is permitted * @p: The task to affect * @ioprio: The I/O priority to set * * Detemine if the requested I/O priority change is permitted for the specified * task, returning 0 if permission is granted, -ve if denied. */ int cap_task_setioprio(struct task_struct *p, int ioprio) { return cap_safe_nice(p); } /** * cap_task_ioprio - Detemine if task priority change is permitted * @p: The task to affect * @nice: The nice value to set * * Detemine if the requested task priority change is permitted for the * specified task, returning 0 if permission is granted, -ve if denied. */ int cap_task_setnice(struct task_struct *p, int nice) { return cap_safe_nice(p); } /* * Implement PR_CAPBSET_DROP. Attempt to remove the specified capability from * the current task's bounding set. Returns 0 on success, -ve on error. */ static int cap_prctl_drop(unsigned long cap) { struct cred *new; if (!ns_capable(current_user_ns(), CAP_SETPCAP)) return -EPERM; if (!cap_valid(cap)) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; cap_lower(new->cap_bset, cap); return commit_creds(new); } /** * cap_task_prctl - Implement process control functions for this security module * @option: The process control function requested * @arg2, @arg3, @arg4, @arg5: The argument data for this function * * Allow process control functions (sys_prctl()) to alter capabilities; may * also deny access to other functions not otherwise implemented here. * * Returns 0 or +ve on success, -ENOSYS if this function is not implemented * here, other -ve on error. If -ENOSYS is returned, sys_prctl() and other LSM * modules will consider performing the function. */ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { const struct cred *old = current_cred(); struct cred *new; switch (option) { case PR_CAPBSET_READ: if (!cap_valid(arg2)) return -EINVAL; return !!cap_raised(old->cap_bset, arg2); case PR_CAPBSET_DROP: return cap_prctl_drop(arg2); /* * The next four prctl's remain to assist with transitioning a * system from legacy UID=0 based privilege (when filesystem * capabilities are not in use) to a system using filesystem * capabilities only - as the POSIX.1e draft intended. * * Note: * * PR_SET_SECUREBITS = * issecure_mask(SECURE_KEEP_CAPS_LOCKED) * | issecure_mask(SECURE_NOROOT) * | issecure_mask(SECURE_NOROOT_LOCKED) * | issecure_mask(SECURE_NO_SETUID_FIXUP) * | issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED) * * will ensure that the current process and all of its * children will be locked into a pure * capability-based-privilege environment. */ case PR_SET_SECUREBITS: if ((((old->securebits & SECURE_ALL_LOCKS) >> 1) & (old->securebits ^ arg2)) /*[1]*/ || ((old->securebits & SECURE_ALL_LOCKS & ~arg2)) /*[2]*/ || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/ || (cap_capable(current_cred(), current_cred()->user_ns, CAP_SETPCAP, CAP_OPT_NONE) != 0) /*[4]*/ /* * [1] no changing of bits that are locked * [2] no unlocking of locks * [3] no setting of unsupported bits * [4] doing anything requires privilege (go read about * the "sendmail capabilities bug") */ ) /* cannot change a locked bit */ return -EPERM; new = prepare_creds(); if (!new) return -ENOMEM; new->securebits = arg2; return commit_creds(new); case PR_GET_SECUREBITS: return old->securebits; case PR_GET_KEEPCAPS: return !!issecure(SECURE_KEEP_CAPS); case PR_SET_KEEPCAPS: if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */ return -EINVAL; if (issecure(SECURE_KEEP_CAPS_LOCKED)) return -EPERM; new = prepare_creds(); if (!new) return -ENOMEM; if (arg2) new->securebits |= issecure_mask(SECURE_KEEP_CAPS); else new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); return commit_creds(new); case PR_CAP_AMBIENT: if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) { if (arg3 | arg4 | arg5) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; cap_clear(new->cap_ambient); return commit_creds(new); } if (((!cap_valid(arg3)) | arg4 | arg5)) return -EINVAL; if (arg2 == PR_CAP_AMBIENT_IS_SET) { return !!cap_raised(current_cred()->cap_ambient, arg3); } else if (arg2 != PR_CAP_AMBIENT_RAISE && arg2 != PR_CAP_AMBIENT_LOWER) { return -EINVAL; } else { if (arg2 == PR_CAP_AMBIENT_RAISE && (!cap_raised(current_cred()->cap_permitted, arg3) || !cap_raised(current_cred()->cap_inheritable, arg3) || issecure(SECURE_NO_CAP_AMBIENT_RAISE))) return -EPERM; new = prepare_creds(); if (!new) return -ENOMEM; if (arg2 == PR_CAP_AMBIENT_RAISE) cap_raise(new->cap_ambient, arg3); else cap_lower(new->cap_ambient, arg3); return commit_creds(new); } default: /* No functionality available - continue with default */ return -ENOSYS; } } /** * cap_vm_enough_memory - Determine whether a new virtual mapping is permitted * @mm: The VM space in which the new mapping is to be made * @pages: The size of the mapping * * Determine whether the allocation of a new virtual mapping by the current * task is permitted, returning 1 if permission is granted, 0 if not. */ int cap_vm_enough_memory(struct mm_struct *mm, long pages) { int cap_sys_admin = 0; if (cap_capable(current_cred(), &init_user_ns, CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) == 0) cap_sys_admin = 1; return cap_sys_admin; } /* * cap_mmap_addr - check if able to map given addr * @addr: address attempting to be mapped * * If the process is attempting to map memory below dac_mmap_min_addr they need * CAP_SYS_RAWIO. The other parameters to this function are unused by the * capability security module. Returns 0 if this mapping should be allowed * -EPERM if not. */ int cap_mmap_addr(unsigned long addr) { int ret = 0; if (addr < dac_mmap_min_addr) { ret = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO, CAP_OPT_NONE); /* set PF_SUPERPRIV if it turns out we allow the low mmap */ if (ret == 0) current->flags |= PF_SUPERPRIV; } return ret; } int cap_mmap_file(struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags) { return 0; } #ifdef CONFIG_SECURITY static struct security_hook_list capability_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(capable, cap_capable), LSM_HOOK_INIT(settime, cap_settime), LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme), LSM_HOOK_INIT(capget, cap_capget), LSM_HOOK_INIT(capset, cap_capset), LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file), LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv), LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv), LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity), LSM_HOOK_INIT(mmap_addr, cap_mmap_addr), LSM_HOOK_INIT(mmap_file, cap_mmap_file), LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid), LSM_HOOK_INIT(task_prctl, cap_task_prctl), LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler), LSM_HOOK_INIT(task_setioprio, cap_task_setioprio), LSM_HOOK_INIT(task_setnice, cap_task_setnice), LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory), }; static int __init capability_init(void) { security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks), "capability"); return 0; } DEFINE_LSM(capability) = { .name = "capability", .order = LSM_ORDER_FIRST, .init = capability_init, }; #endif /* CONFIG_SECURITY */
4 4 4 4 4 4 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 // SPDX-License-Identifier: GPL-2.0 /* * Lockless hierarchical page accounting & limiting * * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner */ #include <linux/page_counter.h> #include <linux/atomic.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/sched.h> #include <linux/bug.h> #include <asm/page.h> static void propagate_protected_usage(struct page_counter *c, unsigned long usage) { unsigned long protected, old_protected; unsigned long low, min; long delta; if (!c->parent) return; min = READ_ONCE(c->min); if (min || atomic_long_read(&c->min_usage)) { protected = min(usage, min); old_protected = atomic_long_xchg(&c->min_usage, protected); delta = protected - old_protected; if (delta) atomic_long_add(delta, &c->parent->children_min_usage); } low = READ_ONCE(c->low); if (low || atomic_long_read(&c->low_usage)) { protected = min(usage, low); old_protected = atomic_long_xchg(&c->low_usage, protected); delta = protected - old_protected; if (delta) atomic_long_add(delta, &c->parent->children_low_usage); } } /** * page_counter_cancel - take pages out of the local counter * @counter: counter * @nr_pages: number of pages to cancel */ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) { long new; new = atomic_long_sub_return(nr_pages, &counter->usage); propagate_protected_usage(counter, new); /* More uncharges than charges? */ WARN_ON_ONCE(new < 0); } /** * page_counter_charge - hierarchically charge pages * @counter: counter * @nr_pages: number of pages to charge * * NOTE: This does not consider any configured counter limits. */ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; for (c = counter; c; c = c->parent) { long new; new = atomic_long_add_return(nr_pages, &c->usage); propagate_protected_usage(c, new); /* * This is indeed racy, but we can live with some * inaccuracy in the watermark. */ if (new > READ_ONCE(c->watermark)) WRITE_ONCE(c->watermark, new); } } /** * page_counter_try_charge - try to hierarchically charge pages * @counter: counter * @nr_pages: number of pages to charge * @fail: points first counter to hit its limit, if any * * Returns %true on success, or %false and @fail if the counter or one * of its ancestors has hit its configured limit. */ bool page_counter_try_charge(struct page_counter *counter, unsigned long nr_pages, struct page_counter **fail) { struct page_counter *c; for (c = counter; c; c = c->parent) { long new; /* * Charge speculatively to avoid an expensive CAS. If * a bigger charge fails, it might falsely lock out a * racing smaller charge and send it into reclaim * early, but the error is limited to the difference * between the two sizes, which is less than 2M/4M in * case of a THP locking out a regular page charge. * * The atomic_long_add_return() implies a full memory * barrier between incrementing the count and reading * the limit. When racing with page_counter_set_max(), * we either see the new limit or the setter sees the * counter has changed and retries. */ new = atomic_long_add_return(nr_pages, &c->usage); if (new > c->max) { atomic_long_sub(nr_pages, &c->usage); propagate_protected_usage(c, new); /* * This is racy, but we can live with some * inaccuracy in the failcnt which is only used * to report stats. */ data_race(c->failcnt++); *fail = c; goto failed; } propagate_protected_usage(c, new); /* * Just like with failcnt, we can live with some * inaccuracy in the watermark. */ if (new > READ_ONCE(c->watermark)) WRITE_ONCE(c->watermark, new); } return true; failed: for (c = counter; c != *fail; c = c->parent) page_counter_cancel(c, nr_pages); return false; } /** * page_counter_uncharge - hierarchically uncharge pages * @counter: counter * @nr_pages: number of pages to uncharge */ void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; for (c = counter; c; c = c->parent) page_counter_cancel(c, nr_pages); } /** * page_counter_set_max - set the maximum number of pages allowed * @counter: counter * @nr_pages: limit to set * * Returns 0 on success, -EBUSY if the current number of pages on the * counter already exceeds the specified limit. * * The caller must serialize invocations on the same counter. */ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) { for (;;) { unsigned long old; long usage; /* * Update the limit while making sure that it's not * below the concurrently-changing counter value. * * The xchg implies two full memory barriers before * and after, so the read-swap-read is ordered and * ensures coherency with page_counter_try_charge(): * that function modifies the count before checking * the limit, so if it sees the old limit, we see the * modified counter and retry. */ usage = atomic_long_read(&counter->usage); if (usage > nr_pages) return -EBUSY; old = xchg(&counter->max, nr_pages); if (atomic_long_read(&counter->usage) <= usage) return 0; counter->max = old; cond_resched(); } } /** * page_counter_set_min - set the amount of protected memory * @counter: counter * @nr_pages: value to set * * The caller must serialize invocations on the same counter. */ void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; WRITE_ONCE(counter->min, nr_pages); for (c = counter; c; c = c->parent) propagate_protected_usage(c, atomic_long_read(&c->usage)); } /** * page_counter_set_low - set the amount of protected memory * @counter: counter * @nr_pages: value to set * * The caller must serialize invocations on the same counter. */ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; WRITE_ONCE(counter->low, nr_pages); for (c = counter; c; c = c->parent) propagate_protected_usage(c, atomic_long_read(&c->usage)); } /** * page_counter_memparse - memparse() for page counter limits * @buf: string to parse * @max: string meaning maximum possible value * @nr_pages: returns the result in number of pages * * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be * limited to %PAGE_COUNTER_MAX. */ int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages) { char *end; u64 bytes; if (!strcmp(buf, max)) { *nr_pages = PAGE_COUNTER_MAX; return 0; } bytes = memparse(buf, &end); if (*end != '\0') return -EINVAL; *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX); return 0; }
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 /* * Performance events: * * Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra * * Data type definitions, declarations, prototypes. * * Started by: Thomas Gleixner and Ingo Molnar * * For licencing details see kernel-base/COPYING */ #ifndef _LINUX_PERF_EVENT_H #define _LINUX_PERF_EVENT_H #include <uapi/linux/perf_event.h> #include <uapi/linux/bpf_perf_event.h> /* * Kernel-internal data types and definitions: */ #ifdef CONFIG_PERF_EVENTS # include <asm/perf_event.h> # include <asm/local64.h> #endif struct perf_guest_info_callbacks { int (*is_in_guest)(void); int (*is_user_mode)(void); unsigned long (*get_guest_ip)(void); void (*handle_intel_pt_intr)(void); }; #ifdef CONFIG_HAVE_HW_BREAKPOINT #include <asm/hw_breakpoint.h> #endif #include <linux/list.h> #include <linux/mutex.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> #include <linux/hrtimer.h> #include <linux/fs.h> #include <linux/pid_namespace.h> #include <linux/workqueue.h> #include <linux/ftrace.h> #include <linux/cpu.h> #include <linux/irq_work.h> #include <linux/static_key.h> #include <linux/jump_label_ratelimit.h> #include <linux/atomic.h> #include <linux/sysfs.h> #include <linux/perf_regs.h> #include <linux/cgroup.h> #include <linux/refcount.h> #include <linux/security.h> #include <asm/local.h> struct perf_callchain_entry { __u64 nr; __u64 ip[]; /* /proc/sys/kernel/perf_event_max_stack */ }; struct perf_callchain_entry_ctx { struct perf_callchain_entry *entry; u32 max_stack; u32 nr; short contexts; bool contexts_maxed; }; typedef unsigned long (*perf_copy_f)(void *dst, const void *src, unsigned long off, unsigned long len); struct perf_raw_frag { union { struct perf_raw_frag *next; unsigned long pad; }; perf_copy_f copy; void *data; u32 size; } __packed; struct perf_raw_record { struct perf_raw_frag frag; u32 size; }; /* * branch stack layout: * nr: number of taken branches stored in entries[] * hw_idx: The low level index of raw branch records * for the most recent branch. * -1ULL means invalid/unknown. * * Note that nr can vary from sample to sample * branches (to, from) are stored from most recent * to least recent, i.e., entries[0] contains the most * recent branch. * The entries[] is an abstraction of raw branch records, * which may not be stored in age order in HW, e.g. Intel LBR. * The hw_idx is to expose the low level index of raw * branch record for the most recent branch aka entries[0]. * The hw_idx index is between -1 (unknown) and max depth, * which can be retrieved in /sys/devices/cpu/caps/branches. * For the architectures whose raw branch records are * already stored in age order, the hw_idx should be 0. */ struct perf_branch_stack { __u64 nr; __u64 hw_idx; struct perf_branch_entry entries[]; }; struct task_struct; /* * extra PMU register associated with an event */ struct hw_perf_event_extra { u64 config; /* register value */ unsigned int reg; /* register address or index */ int alloc; /* extra register already allocated */ int idx; /* index in shared_regs->regs[] */ }; /** * struct hw_perf_event - performance event hardware details: */ struct hw_perf_event { #ifdef CONFIG_PERF_EVENTS union { struct { /* hardware */ u64 config; u64 last_tag; unsigned long config_base; unsigned long event_base; int event_base_rdpmc; int idx; int last_cpu; int flags; struct hw_perf_event_extra extra_reg; struct hw_perf_event_extra branch_reg; }; struct { /* software */ struct hrtimer hrtimer; }; struct { /* tracepoint */ /* for tp_event->class */ struct list_head tp_list; }; struct { /* amd_power */ u64 pwr_acc; u64 ptsc; }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ /* * Crufty hack to avoid the chicken and egg * problem hw_breakpoint has with context * creation and event initalization. */ struct arch_hw_breakpoint info; struct list_head bp_list; }; #endif struct { /* amd_iommu */ u8 iommu_bank; u8 iommu_cntr; u16 padding; u64 conf; u64 conf1; }; }; /* * If the event is a per task event, this will point to the task in * question. See the comment in perf_event_alloc(). */ struct task_struct *target; /* * PMU would store hardware filter configuration * here. */ void *addr_filters; /* Last sync'ed generation of filters */ unsigned long addr_filters_gen; /* * hw_perf_event::state flags; used to track the PERF_EF_* state. */ #define PERF_HES_STOPPED 0x01 /* the counter is stopped */ #define PERF_HES_UPTODATE 0x02 /* event->count up-to-date */ #define PERF_HES_ARCH 0x04 int state; /* * The last observed hardware counter value, updated with a * local64_cmpxchg() such that pmu::read() can be called nested. */ local64_t prev_count; /* * The period to start the next sample with. */ u64 sample_period; union { struct { /* Sampling */ /* * The period we started this sample with. */ u64 last_period; /* * However much is left of the current period; * note that this is a full 64bit value and * allows for generation of periods longer * than hardware might allow. */ local64_t period_left; }; struct { /* Topdown events counting for context switch */ u64 saved_metric; u64 saved_slots; }; }; /* * State for throttling the event, see __perf_event_overflow() and * perf_adjust_freq_unthr_context(). */ u64 interrupts_seq; u64 interrupts; /* * State for freq target events, see __perf_event_overflow() and * perf_adjust_freq_unthr_context(). */ u64 freq_time_stamp; u64 freq_count_stamp; #endif }; struct perf_event; /* * Common implementation detail of pmu::{start,commit,cancel}_txn */ #define PERF_PMU_TXN_ADD 0x1 /* txn to add/schedule event on PMU */ #define PERF_PMU_TXN_READ 0x2 /* txn to read event group from PMU */ /** * pmu::capabilities flags */ #define PERF_PMU_CAP_NO_INTERRUPT 0x01 #define PERF_PMU_CAP_NO_NMI 0x02 #define PERF_PMU_CAP_AUX_NO_SG 0x04 #define PERF_PMU_CAP_EXTENDED_REGS 0x08 #define PERF_PMU_CAP_EXCLUSIVE 0x10 #define PERF_PMU_CAP_ITRACE 0x20 #define PERF_PMU_CAP_HETEROGENEOUS_CPUS 0x40 #define PERF_PMU_CAP_NO_EXCLUDE 0x80 #define PERF_PMU_CAP_AUX_OUTPUT 0x100 struct perf_output_handle; /** * struct pmu - generic performance monitoring unit */ struct pmu { struct list_head entry; struct module *module; struct device *dev; const struct attribute_group **attr_groups; const struct attribute_group **attr_update; const char *name; int type; /* * various common per-pmu feature flags */ int capabilities; int __percpu *pmu_disable_count; struct perf_cpu_context __percpu *pmu_cpu_context; atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ int task_ctx_nr; int hrtimer_interval_ms; /* number of address filters this PMU can do */ unsigned int nr_addr_filters; /* * Fully disable/enable this PMU, can be used to protect from the PMI * as well as for lazy/batch writing of the MSRs. */ void (*pmu_enable) (struct pmu *pmu); /* optional */ void (*pmu_disable) (struct pmu *pmu); /* optional */ /* * Try and initialize the event for this PMU. * * Returns: * -ENOENT -- @event is not for this PMU * * -ENODEV -- @event is for this PMU but PMU not present * -EBUSY -- @event is for this PMU but PMU temporarily unavailable * -EINVAL -- @event is for this PMU but @event is not valid * -EOPNOTSUPP -- @event is for this PMU, @event is valid, but not supported * -EACCES -- @event is for this PMU, @event is valid, but no privileges * * 0 -- @event is for this PMU and valid * * Other error return values are allowed. */ int (*event_init) (struct perf_event *event); /* * Notification that the event was mapped or unmapped. Called * in the context of the mapping task. */ void (*event_mapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ void (*event_unmapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ /* * Flags for ->add()/->del()/ ->start()/->stop(). There are * matching hw_perf_event::state flags. */ #define PERF_EF_START 0x01 /* start the counter when adding */ #define PERF_EF_RELOAD 0x02 /* reload the counter when starting */ #define PERF_EF_UPDATE 0x04 /* update the counter when stopping */ /* * Adds/Removes a counter to/from the PMU, can be done inside a * transaction, see the ->*_txn() methods. * * The add/del callbacks will reserve all hardware resources required * to service the event, this includes any counter constraint * scheduling etc. * * Called with IRQs disabled and the PMU disabled on the CPU the event * is on. * * ->add() called without PERF_EF_START should result in the same state * as ->add() followed by ->stop(). * * ->del() must always PERF_EF_UPDATE stop an event. If it calls * ->stop() that must deal with already being stopped without * PERF_EF_UPDATE. */ int (*add) (struct perf_event *event, int flags); void (*del) (struct perf_event *event, int flags); /* * Starts/Stops a counter present on the PMU. * * The PMI handler should stop the counter when perf_event_overflow() * returns !0. ->start() will be used to continue. * * Also used to change the sample period. * * Called with IRQs disabled and the PMU disabled on the CPU the event * is on -- will be called from NMI context with the PMU generates * NMIs. * * ->stop() with PERF_EF_UPDATE will read the counter and update * period/count values like ->read() would. * * ->start() with PERF_EF_RELOAD will reprogram the counter * value, must be preceded by a ->stop() with PERF_EF_UPDATE. */ void (*start) (struct perf_event *event, int flags); void (*stop) (struct perf_event *event, int flags); /* * Updates the counter value of the event. * * For sampling capable PMUs this will also update the software period * hw_perf_event::period_left field. */ void (*read) (struct perf_event *event); /* * Group events scheduling is treated as a transaction, add * group events as a whole and perform one schedulability test. * If the test fails, roll back the whole group * * Start the transaction, after this ->add() doesn't need to * do schedulability tests. * * Optional. */ void (*start_txn) (struct pmu *pmu, unsigned int txn_flags); /* * If ->start_txn() disabled the ->add() schedulability test * then ->commit_txn() is required to perform one. On success * the transaction is closed. On error the transaction is kept * open until ->cancel_txn() is called. * * Optional. */ int (*commit_txn) (struct pmu *pmu); /* * Will cancel the transaction, assumes ->del() is called * for each successful ->add() during the transaction. * * Optional. */ void (*cancel_txn) (struct pmu *pmu); /* * Will return the value for perf_event_mmap_page::index for this event, * if no implementation is provided it will default to: event->hw.idx + 1. */ int (*event_idx) (struct perf_event *event); /*optional */ /* * context-switches callback */ void (*sched_task) (struct perf_event_context *ctx, bool sched_in); /* * Kmem cache of PMU specific data */ struct kmem_cache *task_ctx_cache; /* * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data) * can be synchronized using this function. See Intel LBR callstack support * implementation and Perf core context switch handling callbacks for usage * examples. */ void (*swap_task_ctx) (struct perf_event_context *prev, struct perf_event_context *next); /* optional */ /* * Set up pmu-private data structures for an AUX area */ void *(*setup_aux) (struct perf_event *event, void **pages, int nr_pages, bool overwrite); /* optional */ /* * Free pmu-private AUX data structures */ void (*free_aux) (void *aux); /* optional */ /* * Take a snapshot of the AUX buffer without touching the event * state, so that preempting ->start()/->stop() callbacks does * not interfere with their logic. Called in PMI context. * * Returns the size of AUX data copied to the output handle. * * Optional. */ long (*snapshot_aux) (struct perf_event *event, struct perf_output_handle *handle, unsigned long size); /* * Validate address range filters: make sure the HW supports the * requested configuration and number of filters; return 0 if the * supplied filters are valid, -errno otherwise. * * Runs in the context of the ioctl()ing process and is not serialized * with the rest of the PMU callbacks. */ int (*addr_filters_validate) (struct list_head *filters); /* optional */ /* * Synchronize address range filter configuration: * translate hw-agnostic filters into hardware configuration in * event::hw::addr_filters. * * Runs as a part of filter sync sequence that is done in ->start() * callback by calling perf_event_addr_filters_sync(). * * May (and should) traverse event::addr_filters::list, for which its * caller provides necessary serialization. */ void (*addr_filters_sync) (struct perf_event *event); /* optional */ /* * Check if event can be used for aux_output purposes for * events of this PMU. * * Runs from perf_event_open(). Should return 0 for "no match" * or non-zero for "match". */ int (*aux_output_match) (struct perf_event *event); /* optional */ /* * Filter events for PMU-specific reasons. */ int (*filter_match) (struct perf_event *event); /* optional */ /* * Check period value for PERF_EVENT_IOC_PERIOD ioctl. */ int (*check_period) (struct perf_event *event, u64 value); /* optional */ }; enum perf_addr_filter_action_t { PERF_ADDR_FILTER_ACTION_STOP = 0, PERF_ADDR_FILTER_ACTION_START, PERF_ADDR_FILTER_ACTION_FILTER, }; /** * struct perf_addr_filter - address range filter definition * @entry: event's filter list linkage * @path: object file's path for file-based filters * @offset: filter range offset * @size: filter range size (size==0 means single address trigger) * @action: filter/start/stop * * This is a hardware-agnostic filter configuration as specified by the user. */ struct perf_addr_filter { struct list_head entry; struct path path; unsigned long offset; unsigned long size; enum perf_addr_filter_action_t action; }; /** * struct perf_addr_filters_head - container for address range filters * @list: list of filters for this event * @lock: spinlock that serializes accesses to the @list and event's * (and its children's) filter generations. * @nr_file_filters: number of file-based filters * * A child event will use parent's @list (and therefore @lock), so they are * bundled together; see perf_event_addr_filters(). */ struct perf_addr_filters_head { struct list_head list; raw_spinlock_t lock; unsigned int nr_file_filters; }; struct perf_addr_filter_range { unsigned long start; unsigned long size; }; /** * enum perf_event_state - the states of an event: */ enum perf_event_state { PERF_EVENT_STATE_DEAD = -4, PERF_EVENT_STATE_EXIT = -3, PERF_EVENT_STATE_ERROR = -2, PERF_EVENT_STATE_OFF = -1, PERF_EVENT_STATE_INACTIVE = 0, PERF_EVENT_STATE_ACTIVE = 1, }; struct file; struct perf_sample_data; typedef void (*perf_overflow_handler_t)(struct perf_event *, struct perf_sample_data *, struct pt_regs *regs); /* * Event capabilities. For event_caps and groups caps. * * PERF_EV_CAP_SOFTWARE: Is a software event. * PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read * from any CPU in the package where it is active. * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and * cannot be a group leader. If an event with this flag is detached from the * group it is scheduled out and moved into an unrecoverable ERROR state. */ #define PERF_EV_CAP_SOFTWARE BIT(0) #define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1) #define PERF_EV_CAP_SIBLING BIT(2) #define SWEVENT_HLIST_BITS 8 #define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) struct swevent_hlist { struct hlist_head heads[SWEVENT_HLIST_SIZE]; struct rcu_head rcu_head; }; #define PERF_ATTACH_CONTEXT 0x01 #define PERF_ATTACH_GROUP 0x02 #define PERF_ATTACH_TASK 0x04 #define PERF_ATTACH_TASK_DATA 0x08 #define PERF_ATTACH_ITRACE 0x10 #define PERF_ATTACH_SCHED_CB 0x20 struct perf_cgroup; struct perf_buffer; struct pmu_event_list { raw_spinlock_t lock; struct list_head list; }; #define for_each_sibling_event(sibling, event) \ if ((event)->group_leader == (event)) \ list_for_each_entry((sibling), &(event)->sibling_list, sibling_list) /** * struct perf_event - performance event kernel representation: */ struct perf_event { #ifdef CONFIG_PERF_EVENTS /* * entry onto perf_event_context::event_list; * modifications require ctx->lock * RCU safe iterations. */ struct list_head event_entry; /* * Locked for modification by both ctx->mutex and ctx->lock; holding * either sufficies for read. */ struct list_head sibling_list; struct list_head active_list; /* * Node on the pinned or flexible tree located at the event context; */ struct rb_node group_node; u64 group_index; /* * We need storage to track the entries in perf_pmu_migrate_context; we * cannot use the event_entry because of RCU and we want to keep the * group in tact which avoids us using the other two entries. */ struct list_head migrate_entry; struct hlist_node hlist_entry; struct list_head active_entry; int nr_siblings; /* Not serialized. Only written during event initialization. */ int event_caps; /* The cumulative AND of all event_caps for events in this group. */ int group_caps; unsigned int group_generation; struct perf_event *group_leader; struct pmu *pmu; void *pmu_private; enum perf_event_state state; unsigned int attach_state; local64_t count; atomic64_t child_count; /* * These are the total time in nanoseconds that the event * has been enabled (i.e. eligible to run, and the task has * been scheduled in, if this is a per-task event) * and running (scheduled onto the CPU), respectively. */ u64 total_time_enabled; u64 total_time_running; u64 tstamp; struct perf_event_attr attr; u16 header_size; u16 id_header_size; u16 read_size; struct hw_perf_event hw; struct perf_event_context *ctx; atomic_long_t refcount; /* * These accumulate total time (in nanoseconds) that children * events have been enabled and running, respectively. */ atomic64_t child_total_time_enabled; atomic64_t child_total_time_running; /* * Protect attach/detach and child_list: */ struct mutex child_mutex; struct list_head child_list; struct perf_event *parent; int oncpu; int cpu; struct list_head owner_entry; struct task_struct *owner; /* mmap bits */ struct mutex mmap_mutex; atomic_t mmap_count; struct perf_buffer *rb; struct list_head rb_entry; unsigned long rcu_batches; int rcu_pending; /* poll related */ wait_queue_head_t waitq; struct fasync_struct *fasync; /* delayed work for NMIs and such */ int pending_wakeup; int pending_kill; int pending_disable; struct irq_work pending; atomic_t event_limit; /* address range filters */ struct perf_addr_filters_head addr_filters; /* vma address array for file-based filders */ struct perf_addr_filter_range *addr_filter_ranges; unsigned long addr_filters_gen; /* for aux_output events */ struct perf_event *aux_event; void (*destroy)(struct perf_event *); struct rcu_head rcu_head; struct pid_namespace *ns; u64 id; atomic64_t lost_samples; u64 (*clock)(void); perf_overflow_handler_t overflow_handler; void *overflow_handler_context; #ifdef CONFIG_BPF_SYSCALL perf_overflow_handler_t orig_overflow_handler; struct bpf_prog *prog; #endif #ifdef CONFIG_EVENT_TRACING struct trace_event_call *tp_event; struct event_filter *filter; #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops ftrace_ops; #endif #endif #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; /* cgroup event is attach to */ #endif #ifdef CONFIG_SECURITY void *security; #endif struct list_head sb_list; #endif /* CONFIG_PERF_EVENTS */ }; struct perf_event_groups { struct rb_root tree; u64 index; }; /** * struct perf_event_context - event context structure * * Used as a container for task events and CPU events as well: */ struct perf_event_context { struct pmu *pmu; /* * Protect the states of the events in the list, * nr_active, and the list: */ raw_spinlock_t lock; /* * Protect the list of events. Locking either mutex or lock * is sufficient to ensure the list doesn't change; to change * the list you need to lock both the mutex and the spinlock. */ struct mutex mutex; struct list_head active_ctx_list; struct perf_event_groups pinned_groups; struct perf_event_groups flexible_groups; struct list_head event_list; struct list_head pinned_active; struct list_head flexible_active; int nr_events; int nr_active; int is_active; int nr_stat; int nr_freq; int rotate_disable; /* * Set when nr_events != nr_active, except tolerant to events not * necessary to be active due to scheduling constraints, such as cgroups. */ int rotate_necessary; refcount_t refcount; struct task_struct *task; /* * Context clock, runs when context enabled. */ u64 time; u64 timestamp; u64 timeoffset; /* * These fields let us detect when two contexts have both * been cloned (inherited) from a common ancestor. */ struct perf_event_context *parent_ctx; u64 parent_gen; u64 generation; int pin_count; #ifdef CONFIG_CGROUP_PERF int nr_cgroups; /* cgroup evts */ #endif void *task_ctx_data; /* pmu specific data */ struct rcu_head rcu_head; }; /* * Number of contexts where an event can trigger: * task, softirq, hardirq, nmi. */ #define PERF_NR_CONTEXTS 4 /** * struct perf_event_cpu_context - per cpu event context structure */ struct perf_cpu_context { struct perf_event_context ctx; struct perf_event_context *task_ctx; int active_oncpu; int exclusive; raw_spinlock_t hrtimer_lock; struct hrtimer hrtimer; ktime_t hrtimer_interval; unsigned int hrtimer_active; #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; struct list_head cgrp_cpuctx_entry; #endif struct list_head sched_cb_entry; int sched_cb_usage; int online; /* * Per-CPU storage for iterators used in visit_groups_merge. The default * storage is of size 2 to hold the CPU and any CPU event iterators. */ int heap_size; struct perf_event **heap; struct perf_event *heap_default[2]; }; struct perf_output_handle { struct perf_event *event; struct perf_buffer *rb; unsigned long wakeup; unsigned long size; u64 aux_flags; union { void *addr; unsigned long head; }; int page; }; struct bpf_perf_event_data_kern { bpf_user_pt_regs_t *regs; struct perf_sample_data *data; struct perf_event *event; }; #ifdef CONFIG_CGROUP_PERF /* * perf_cgroup_info keeps track of time_enabled for a cgroup. * This is a per-cpu dynamically allocated data structure. */ struct perf_cgroup_info { u64 time; u64 timestamp; u64 timeoffset; int active; }; struct perf_cgroup { struct cgroup_subsys_state css; struct perf_cgroup_info __percpu *info; }; /* * Must ensure cgroup is pinned (css_get) before calling * this function. In other words, we cannot call this function * if there is no cgroup event for the current CPU context. */ static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx) { return container_of(task_css_check(task, perf_event_cgrp_id, ctx ? lockdep_is_held(&ctx->lock) : true), struct perf_cgroup, css); } #endif /* CONFIG_CGROUP_PERF */ #ifdef CONFIG_PERF_EVENTS extern void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event); extern void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size); extern int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size); extern void *perf_get_aux(struct perf_output_handle *handle); extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags); extern void perf_event_itrace_started(struct perf_event *event); extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); extern void perf_pmu_unregister(struct pmu *pmu); extern int perf_num_counters(void); extern const char *perf_pmu_name(void); extern void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task); extern void __perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next); extern int perf_event_init_task(struct task_struct *child); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); extern struct file *perf_event_get(unsigned int fd); extern const struct perf_event *perf_get_event(struct file *file); extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); extern void perf_pmu_enable(struct pmu *pmu); extern void perf_sched_cb_dec(struct pmu *pmu); extern void perf_sched_cb_inc(struct pmu *pmu); extern int perf_event_task_disable(void); extern int perf_event_task_enable(void); extern void perf_pmu_resched(struct pmu *pmu); extern int perf_event_refresh(struct perf_event *event, int refresh); extern void perf_event_update_userpage(struct perf_event *event); extern int perf_event_release_kernel(struct perf_event *event); extern struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, perf_overflow_handler_t callback, void *context); extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); struct perf_sample_data { /* * Fields set by perf_sample_data_init(), group so as to * minimize the cachelines touched. */ u64 addr; struct perf_raw_record *raw; struct perf_branch_stack *br_stack; u64 period; u64 weight; u64 txn; union perf_mem_data_src data_src; /* * The other fields, optionally {set,used} by * perf_{prepare,output}_sample(). */ u64 type; u64 ip; struct { u32 pid; u32 tid; } tid_entry; u64 time; u64 id; u64 stream_id; struct { u32 cpu; u32 reserved; } cpu_entry; struct perf_callchain_entry *callchain; u64 aux_size; struct perf_regs regs_user; struct perf_regs regs_intr; u64 stack_user_size; u64 phys_addr; u64 cgroup; } ____cacheline_aligned; /* default value for data source */ #define PERF_MEM_NA (PERF_MEM_S(OP, NA) |\ PERF_MEM_S(LVL, NA) |\ PERF_MEM_S(SNOOP, NA) |\ PERF_MEM_S(LOCK, NA) |\ PERF_MEM_S(TLB, NA)) static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { /* remaining struct members initialized in perf_prepare_sample() */ data->addr = addr; data->raw = NULL; data->br_stack = NULL; data->period = period; data->weight = 0; data->data_src.val = PERF_MEM_NA; data->txn = 0; } extern void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event); extern void perf_prepare_sample(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs); extern int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern void perf_event_output_forward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern void perf_event_output_backward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern int perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); static inline bool __is_default_overflow_handler(perf_overflow_handler_t overflow_handler) { if (likely(overflow_handler == perf_event_output_forward)) return true; if (unlikely(overflow_handler == perf_event_output_backward)) return true; return false; } #define is_default_overflow_handler(event) \ __is_default_overflow_handler((event)->overflow_handler) #ifdef CONFIG_BPF_SYSCALL static inline bool uses_default_overflow_handler(struct perf_event *event) { if (likely(is_default_overflow_handler(event))) return true; return __is_default_overflow_handler(event->orig_overflow_handler); } #else #define uses_default_overflow_handler(event) \ is_default_overflow_handler(event) #endif extern void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event); extern void perf_event__output_id_sample(struct perf_event *event, struct perf_output_handle *handle, struct perf_sample_data *sample); extern void perf_log_lost_samples(struct perf_event *event, u64 lost); static inline bool event_has_any_exclude_flag(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; return attr->exclude_idle || attr->exclude_user || attr->exclude_kernel || attr->exclude_hv || attr->exclude_guest || attr->exclude_host; } static inline bool is_sampling_event(struct perf_event *event) { return event->attr.sample_period != 0; } /* * Return 1 for a software event, 0 for a hardware event */ static inline int is_software_event(struct perf_event *event) { return event->event_caps & PERF_EV_CAP_SOFTWARE; } /* * Return 1 for event in sw context, 0 for event in hw context */ static inline int in_software_context(struct perf_event *event) { return event->ctx->pmu->task_ctx_nr == perf_sw_context; } static inline int is_exclusive_pmu(struct pmu *pmu) { return pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE; } extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64); extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); #ifndef perf_arch_fetch_caller_regs static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { } #endif /* * When generating a perf sample in-line, instead of from an interrupt / * exception, we lack a pt_regs. This is typically used from software events * like: SW_CONTEXT_SWITCHES, SW_MIGRATIONS and the tie-in with tracepoints. * * We typically don't need a full set, but (for x86) do require: * - ip for PERF_SAMPLE_IP * - cs for user_mode() tests * - sp for PERF_SAMPLE_CALLCHAIN * - eflags for MISC bits and CALLCHAIN (see: perf_hw_regs()) * * NOTE: assumes @regs is otherwise already 0 filled; this is important for * things like PERF_SAMPLE_REGS_INTR. */ static inline void perf_fetch_caller_regs(struct pt_regs *regs) { perf_arch_fetch_caller_regs(regs, CALLER_ADDR0); } static __always_inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { if (static_key_false(&perf_swevent_enabled[event_id])) __perf_sw_event(event_id, nr, regs, addr); } DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]); /* * 'Special' version for the scheduler, it hard assumes no recursion, * which is guaranteed by us not actually scheduling inside other swevents * because those disable preemption. */ static __always_inline void perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) { if (static_key_false(&perf_swevent_enabled[event_id])) { struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]); perf_fetch_caller_regs(regs); ___perf_sw_event(event_id, nr, regs, addr); } } extern struct static_key_false perf_sched_events; static __always_inline bool perf_sw_migrate_enabled(void) { if (static_key_false(&perf_swevent_enabled[PERF_COUNT_SW_CPU_MIGRATIONS])) return true; return false; } static inline void perf_event_task_migrate(struct task_struct *task) { if (perf_sw_migrate_enabled()) task->sched_migrated = 1; } static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { if (static_branch_unlikely(&perf_sched_events)) __perf_event_task_sched_in(prev, task); if (perf_sw_migrate_enabled() && task->sched_migrated) { struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]); perf_fetch_caller_regs(regs); ___perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, regs, 0); task->sched_migrated = 0; } } static inline void perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next) { perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0); if (static_branch_unlikely(&perf_sched_events)) __perf_event_task_sched_out(prev, next); } extern void perf_event_mmap(struct vm_area_struct *vma); extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym); extern void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags); extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void) { /* * Callbacks are RCU-protected and must be READ_ONCE to avoid reloading * the callbacks between a !NULL check and dereferences, to ensure * pending stores/changes to the callback pointers are visible before a * non-NULL perf_guest_cbs is visible to readers, and to prevent a * module from unloading callbacks while readers are active. */ return rcu_dereference(perf_guest_cbs); } extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); extern void perf_event_namespaces(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); extern void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len); /* Callchains */ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark); extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); extern struct perf_callchain_entry *get_callchain_entry(int *rctx); extern void put_callchain_entry(int rctx); extern int sysctl_perf_event_max_stack; extern int sysctl_perf_event_max_contexts_per_stack; static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip) { if (ctx->contexts < sysctl_perf_event_max_contexts_per_stack) { struct perf_callchain_entry *entry = ctx->entry; entry->ip[entry->nr++] = ip; ++ctx->contexts; return 0; } else { ctx->contexts_maxed = true; return -1; /* no more room, stop walking the stack */ } } static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip) { if (ctx->nr < ctx->max_stack && !ctx->contexts_maxed) { struct perf_callchain_entry *entry = ctx->entry; entry->ip[entry->nr++] = ip; ++ctx->nr; return 0; } else { return -1; /* no more room, stop walking the stack */ } } extern int sysctl_perf_event_paranoid; extern int sysctl_perf_event_mlock; extern int sysctl_perf_event_sample_rate; extern int sysctl_perf_cpu_time_max_percent; extern void perf_sample_event_took(u64 sample_len_ns); int perf_proc_update_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int perf_event_max_stack_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); /* Access to perf_event_open(2) syscall. */ #define PERF_SECURITY_OPEN 0 /* Finer grained perf_event_open(2) access control. */ #define PERF_SECURITY_CPU 1 #define PERF_SECURITY_KERNEL 2 #define PERF_SECURITY_TRACEPOINT 3 static inline int perf_is_paranoid(void) { return sysctl_perf_event_paranoid > -1; } static inline int perf_allow_kernel(struct perf_event_attr *attr) { if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) return -EACCES; return security_perf_event_open(attr, PERF_SECURITY_KERNEL); } static inline int perf_allow_cpu(struct perf_event_attr *attr) { if (sysctl_perf_event_paranoid > 0 && !perfmon_capable()) return -EACCES; return security_perf_event_open(attr, PERF_SECURITY_CPU); } static inline int perf_allow_tracepoint(struct perf_event_attr *attr) { if (sysctl_perf_event_paranoid > -1 && !perfmon_capable()) return -EPERM; return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT); } extern void perf_event_init(void); extern void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task); extern void perf_bp_event(struct perf_event *event, void *data); #ifndef perf_misc_flags # define perf_misc_flags(regs) \ (user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL) # define perf_instruction_pointer(regs) instruction_pointer(regs) #endif #ifndef perf_arch_bpf_user_pt_regs # define perf_arch_bpf_user_pt_regs(regs) regs #endif static inline bool has_branch_stack(struct perf_event *event) { return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; } static inline bool needs_branch_stack(struct perf_event *event) { return event->attr.branch_sample_type != 0; } static inline bool has_aux(struct perf_event *event) { return event->pmu->setup_aux; } static inline bool is_write_backward(struct perf_event *event) { return !!event->attr.write_backward; } static inline bool has_addr_filter(struct perf_event *event) { return event->pmu->nr_addr_filters; } /* * An inherited event uses parent's filters */ static inline struct perf_addr_filters_head * perf_event_addr_filters(struct perf_event *event) { struct perf_addr_filters_head *ifh = &event->addr_filters; if (event->parent) ifh = &event->parent->addr_filters; return ifh; } extern void perf_event_addr_filters_sync(struct perf_event *event); extern int perf_output_begin(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_forward(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_backward(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern void perf_output_end(struct perf_output_handle *handle); extern unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); extern unsigned int perf_output_skip(struct perf_output_handle *handle, unsigned int len); extern long perf_output_copy_aux(struct perf_output_handle *aux_handle, struct perf_output_handle *handle, unsigned long from, unsigned long to); extern int perf_swevent_get_recursion_context(void); extern void perf_swevent_put_recursion_context(int rctx); extern u64 perf_swevent_set_period(struct perf_event *event); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); extern void perf_event_disable_local(struct perf_event *event); extern void perf_event_disable_inatomic(struct perf_event *event); extern void perf_event_task_tick(void); extern int perf_event_account_interrupt(struct perf_event *event); extern int perf_event_period(struct perf_event *event, u64 value); extern u64 perf_event_pause(struct perf_event *event, bool reset); #else /* !CONFIG_PERF_EVENTS: */ static inline void * perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event) { return NULL; } static inline void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) { } static inline int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) { return -EINVAL; } static inline void * perf_get_aux(struct perf_output_handle *handle) { return NULL; } static inline void perf_event_task_migrate(struct task_struct *task) { } static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { } static inline void perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next) { } static inline int perf_event_init_task(struct task_struct *child) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_delayed_put(struct task_struct *task) { } static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } static inline const struct perf_event *perf_get_event(struct file *file) { return ERR_PTR(-EINVAL); } static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { return ERR_PTR(-EINVAL); } static inline int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running) { return -EINVAL; } static inline void perf_event_print_debug(void) { } static inline int perf_event_task_disable(void) { return -EINVAL; } static inline int perf_event_task_enable(void) { return -EINVAL; } static inline int perf_event_refresh(struct perf_event *event, int refresh) { return -EINVAL; } static inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } static inline void perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) { } static inline void perf_bp_event(struct perf_event *event, void *data) { } static inline int perf_register_guest_info_callbacks (struct perf_guest_info_callbacks *callbacks) { return 0; } static inline int perf_unregister_guest_info_callbacks (struct perf_guest_info_callbacks *callbacks) { return 0; } static inline void perf_event_mmap(struct vm_area_struct *vma) { } typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data); static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym) { } static inline void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags) { } static inline void perf_event_exec(void) { } static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } static inline void perf_event_namespaces(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len) { } static inline void perf_event_init(void) { } static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } static inline u64 perf_swevent_set_period(struct perf_event *event) { return 0; } static inline void perf_event_enable(struct perf_event *event) { } static inline void perf_event_disable(struct perf_event *event) { } static inline int __perf_event_disable(void *info) { return -1; } static inline void perf_event_task_tick(void) { } static inline int perf_event_release_kernel(struct perf_event *event) { return 0; } static inline int perf_event_period(struct perf_event *event, u64 value) { return -EINVAL; } static inline u64 perf_event_pause(struct perf_event *event, bool reset) { return 0; } #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern void perf_restore_debug_store(void); #else static inline void perf_restore_debug_store(void) { } #endif static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag) { return frag->pad < sizeof(u64); } #define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x)) struct perf_pmu_events_attr { struct device_attribute attr; u64 id; const char *event_str; }; struct perf_pmu_events_ht_attr { struct device_attribute attr; u64 id; const char *event_str_ht; const char *event_str_noht; }; ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); #define PMU_EVENT_ATTR(_name, _var, _id, _show) \ static struct perf_pmu_events_attr _var = { \ .attr = __ATTR(_name, 0444, _show, NULL), \ .id = _id, \ }; #define PMU_EVENT_ATTR_STRING(_name, _var, _str) \ static struct perf_pmu_events_attr _var = { \ .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ .id = 0, \ .event_str = _str, \ }; #define PMU_FORMAT_ATTR(_name, _format) \ static ssize_t \ _name##_show(struct device *dev, \ struct device_attribute *attr, \ char *page) \ { \ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ return sprintf(page, _format "\n"); \ } \ \ static struct device_attribute format_attr_##_name = __ATTR_RO(_name) /* Performance counter hotplug functions */ #ifdef CONFIG_PERF_EVENTS int perf_event_init_cpu(unsigned int cpu); int perf_event_exit_cpu(unsigned int cpu); #else #define perf_event_init_cpu NULL #define perf_event_exit_cpu NULL #endif extern void __weak arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now); #endif /* _LINUX_PERF_EVENT_H */
8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_USER_NAMESPACE_H #define _LINUX_USER_NAMESPACE_H #include <linux/kref.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/rwsem.h> #include <linux/sysctl.h> #include <linux/err.h> #define UID_GID_MAP_MAX_BASE_EXTENTS 5 #define UID_GID_MAP_MAX_EXTENTS 340 struct uid_gid_extent { u32 first; u32 lower_first; u32 count; }; struct uid_gid_map { /* 64 bytes -- 1 cache line */ u32 nr_extents; union { struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS]; struct { struct uid_gid_extent *forward; struct uid_gid_extent *reverse; }; }; }; #define USERNS_SETGROUPS_ALLOWED 1UL #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED struct ucounts; enum ucount_type { UCOUNT_USER_NAMESPACES, UCOUNT_PID_NAMESPACES, UCOUNT_UTS_NAMESPACES, UCOUNT_IPC_NAMESPACES, UCOUNT_NET_NAMESPACES, UCOUNT_MNT_NAMESPACES, UCOUNT_CGROUP_NAMESPACES, UCOUNT_TIME_NAMESPACES, #ifdef CONFIG_INOTIFY_USER UCOUNT_INOTIFY_INSTANCES, UCOUNT_INOTIFY_WATCHES, #endif UCOUNT_COUNTS, }; struct user_namespace { struct uid_gid_map uid_map; struct uid_gid_map gid_map; struct uid_gid_map projid_map; atomic_t count; struct user_namespace *parent; int level; kuid_t owner; kgid_t group; struct ns_common ns; unsigned long flags; /* parent_could_setfcap: true if the creator if this ns had CAP_SETFCAP * in its effective capability set at the child ns creation time. */ bool parent_could_setfcap; #ifdef CONFIG_KEYS /* List of joinable keyrings in this namespace. Modification access of * these pointers is controlled by keyring_sem. Once * user_keyring_register is set, it won't be changed, so it can be * accessed directly with READ_ONCE(). */ struct list_head keyring_name_list; struct key *user_keyring_register; struct rw_semaphore keyring_sem; #endif /* Register of per-UID persistent keyrings for this namespace */ #ifdef CONFIG_PERSISTENT_KEYRINGS struct key *persistent_keyring_register; #endif struct work_struct work; #ifdef CONFIG_SYSCTL struct ctl_table_set set; struct ctl_table_header *sysctls; #endif struct ucounts *ucounts; int ucount_max[UCOUNT_COUNTS]; } __randomize_layout; struct ucounts { struct hlist_node node; struct user_namespace *ns; kuid_t uid; int count; atomic_t ucount[UCOUNT_COUNTS]; }; extern struct user_namespace init_user_ns; bool setup_userns_sysctls(struct user_namespace *ns); void retire_userns_sysctls(struct user_namespace *ns); struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type); void dec_ucount(struct ucounts *ucounts, enum ucount_type type); #ifdef CONFIG_USER_NS static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) atomic_inc(&ns->count); return ns; } extern int create_user_ns(struct cred *new); extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred); extern void __put_user_ns(struct user_namespace *ns); static inline void put_user_ns(struct user_namespace *ns) { if (ns && atomic_dec_and_test(&ns->count)) __put_user_ns(ns); } struct seq_operations; extern const struct seq_operations proc_uid_seq_operations; extern const struct seq_operations proc_gid_seq_operations; extern const struct seq_operations proc_projid_seq_operations; extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *); extern int proc_setgroups_show(struct seq_file *m, void *v); extern bool userns_may_setgroups(const struct user_namespace *ns); extern bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child); extern bool current_in_userns(const struct user_namespace *target_ns); struct ns_common *ns_get_owner(struct ns_common *ns); #else static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { return &init_user_ns; } static inline int create_user_ns(struct cred *new) { return -EINVAL; } static inline int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { if (unshare_flags & CLONE_NEWUSER) return -EINVAL; return 0; } static inline void put_user_ns(struct user_namespace *ns) { } static inline bool userns_may_setgroups(const struct user_namespace *ns) { return true; } static inline bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { return true; } static inline bool current_in_userns(const struct user_namespace *target_ns) { return true; } static inline struct ns_common *ns_get_owner(struct ns_common *ns) { return ERR_PTR(-EPERM); } #endif #endif /* _LINUX_USER_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FS_H #define _LINUX_FS_H #include <linux/linkage.h> #include <linux/wait_bit.h> #include <linux/kdev_t.h> #include <linux/dcache.h> #include <linux/path.h> #include <linux/stat.h> #include <linux/cache.h> #include <linux/list.h> #include <linux/list_lru.h> #include <linux/llist.h> #include <linux/radix-tree.h> #include <linux/xarray.h> #include <linux/rbtree.h> #include <linux/init.h> #include <linux/pid.h> #include <linux/bug.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/mm_types.h> #include <linux/capability.h> #include <linux/semaphore.h> #include <linux/fcntl.h> #include <linux/rculist_bl.h> #include <linux/atomic.h> #include <linux/shrinker.h> #include <linux/migrate_mode.h> #include <linux/uidgid.h> #include <linux/lockdep.h> #include <linux/percpu-rwsem.h> #include <linux/workqueue.h> #include <linux/delayed_call.h> #include <linux/uuid.h> #include <linux/errseq.h> #include <linux/ioprio.h> #include <linux/fs_types.h> #include <linux/build_bug.h> #include <linux/stddef.h> #include <asm/byteorder.h> #include <uapi/linux/fs.h> struct backing_dev_info; struct bdi_writeback; struct bio; struct export_operations; struct fiemap_extent_info; struct hd_geometry; struct iovec; struct kiocb; struct kobject; struct pipe_inode_info; struct poll_table_struct; struct kstatfs; struct vm_area_struct; struct vfsmount; struct cred; struct swap_info_struct; struct seq_file; struct workqueue_struct; struct iov_iter; struct fscrypt_info; struct fscrypt_operations; struct fsverity_info; struct fsverity_operations; struct fs_context; struct fs_parameter_spec; extern void __init inode_init(void); extern void __init inode_init_early(void); extern void __init files_init(void); extern void __init files_maxfiles_init(void); extern struct files_stat_struct files_stat; extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; extern struct inodes_stat_t inodes_stat; extern int leases_enable, lease_break_time; extern int sysctl_protected_symlinks; extern int sysctl_protected_hardlinks; extern int sysctl_protected_fifos; extern int sysctl_protected_regular; typedef __kernel_rwf_t rwf_t; struct buffer_head; typedef int (get_block_t)(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, ssize_t bytes, void *private); #define MAY_EXEC 0x00000001 #define MAY_WRITE 0x00000002 #define MAY_READ 0x00000004 #define MAY_APPEND 0x00000008 #define MAY_ACCESS 0x00000010 #define MAY_OPEN 0x00000020 #define MAY_CHDIR 0x00000040 /* called from RCU mode, don't block */ #define MAY_NOT_BLOCK 0x00000080 /* * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond * to O_WRONLY and O_RDWR via the strange trick in do_dentry_open() */ /* file is open for reading */ #define FMODE_READ ((__force fmode_t)0x1) /* file is open for writing */ #define FMODE_WRITE ((__force fmode_t)0x2) /* file is seekable */ #define FMODE_LSEEK ((__force fmode_t)0x4) /* file can be accessed using pread */ #define FMODE_PREAD ((__force fmode_t)0x8) /* file can be accessed using pwrite */ #define FMODE_PWRITE ((__force fmode_t)0x10) /* File is opened for execution with sys_execve / sys_uselib */ #define FMODE_EXEC ((__force fmode_t)0x20) /* File is opened with O_NDELAY (only set for block devices) */ #define FMODE_NDELAY ((__force fmode_t)0x40) /* File is opened with O_EXCL (only set for block devices) */ #define FMODE_EXCL ((__force fmode_t)0x80) /* File is opened using open(.., 3, ..) and is writeable only for ioctls (specialy hack for floppy.c) */ #define FMODE_WRITE_IOCTL ((__force fmode_t)0x100) /* 32bit hashes as llseek() offset (for directories) */ #define FMODE_32BITHASH ((__force fmode_t)0x200) /* 64bit hashes as llseek() offset (for directories) */ #define FMODE_64BITHASH ((__force fmode_t)0x400) /* * Don't update ctime and mtime. * * Currently a special hack for the XFS open_by_handle ioctl, but we'll * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon. */ #define FMODE_NOCMTIME ((__force fmode_t)0x800) /* Expect random access pattern */ #define FMODE_RANDOM ((__force fmode_t)0x1000) /* File is huge (eg. /dev/kmem): treat loff_t as unsigned */ #define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000) /* File is opened with O_PATH; almost nothing can be done with it */ #define FMODE_PATH ((__force fmode_t)0x4000) /* File needs atomic accesses to f_pos */ #define FMODE_ATOMIC_POS ((__force fmode_t)0x8000) /* Write access to underlying fs */ #define FMODE_WRITER ((__force fmode_t)0x10000) /* Has read method(s) */ #define FMODE_CAN_READ ((__force fmode_t)0x20000) /* Has write method(s) */ #define FMODE_CAN_WRITE ((__force fmode_t)0x40000) #define FMODE_OPENED ((__force fmode_t)0x80000) #define FMODE_CREATED ((__force fmode_t)0x100000) /* File is stream-like */ #define FMODE_STREAM ((__force fmode_t)0x200000) /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) /* File is capable of returning -EAGAIN if I/O will block */ #define FMODE_NOWAIT ((__force fmode_t)0x8000000) /* File represents mount that needs unmounting */ #define FMODE_NEED_UNMOUNT ((__force fmode_t)0x10000000) /* File does not contribute to nr_files count */ #define FMODE_NOACCOUNT ((__force fmode_t)0x20000000) /* File supports async buffered reads */ #define FMODE_BUF_RASYNC ((__force fmode_t)0x40000000) /* * Attribute flags. These should be or-ed together to figure out what * has been changed! */ #define ATTR_MODE (1 << 0) #define ATTR_UID (1 << 1) #define ATTR_GID (1 << 2) #define ATTR_SIZE (1 << 3) #define ATTR_ATIME (1 << 4) #define ATTR_MTIME (1 << 5) #define ATTR_CTIME (1 << 6) #define ATTR_ATIME_SET (1 << 7) #define ATTR_MTIME_SET (1 << 8) #define ATTR_FORCE (1 << 9) /* Not a change, but a change it */ #define ATTR_KILL_SUID (1 << 11) #define ATTR_KILL_SGID (1 << 12) #define ATTR_FILE (1 << 13) #define ATTR_KILL_PRIV (1 << 14) #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ #define ATTR_TIMES_SET (1 << 16) #define ATTR_TOUCH (1 << 17) /* * Whiteout is represented by a char device. The following constants define the * mode and device number to use. */ #define WHITEOUT_MODE 0 #define WHITEOUT_DEV 0 /* * This is the Inode Attributes structure, used for notify_change(). It * uses the above definitions as flags, to know which values have changed. * Also, in this manner, a Filesystem can look at only the values it cares * about. Basically, these are the attributes that the VFS layer can * request to change from the FS layer. * * Derek Atkins <warlord@MIT.EDU> 94-10-20 */ struct iattr { unsigned int ia_valid; umode_t ia_mode; kuid_t ia_uid; kgid_t ia_gid; loff_t ia_size; struct timespec64 ia_atime; struct timespec64 ia_mtime; struct timespec64 ia_ctime; /* * Not an attribute, but an auxiliary info for filesystems wanting to * implement an ftruncate() like method. NOTE: filesystem should * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL). */ struct file *ia_file; }; /* * Includes for diskquotas. */ #include <linux/quota.h> /* * Maximum number of layers of fs stack. Needs to be limited to * prevent kernel stack overflow */ #define FILESYSTEM_MAX_STACK_DEPTH 2 /** * enum positive_aop_returns - aop return codes with specific semantics * * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has * completed, that the page is still locked, and * should be considered active. The VM uses this hint * to return the page to the active list -- it won't * be a candidate for writeback again in the near * future. Other callers must be careful to unlock * the page if they get this return. Returned by * writepage(); * * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has * unlocked it and the page might have been truncated. * The caller should back up to acquiring a new page and * trying again. The aop will be taking reasonable * precautions not to livelock. If the caller held a page * reference, it should drop it before retrying. Returned * by readpage(). * * address_space_operation functions return these large constants to indicate * special semantics to the caller. These are much larger than the bytes in a * page to allow for functions that return the number of bytes operated on in a * given page. */ enum positive_aop_returns { AOP_WRITEPAGE_ACTIVATE = 0x80000, AOP_TRUNCATED_PAGE = 0x80001, }; #define AOP_FLAG_CONT_EXPAND 0x0001 /* called from cont_expand */ #define AOP_FLAG_NOFS 0x0002 /* used by filesystem to direct * helper code (eg buffer layer) * to clear GFP_FS from alloc */ /* * oh the beauties of C type declarations. */ struct page; struct address_space; struct writeback_control; struct readahead_control; /* * Write life time hint values. * Stored in struct inode as u8. */ enum rw_hint { WRITE_LIFE_NOT_SET = 0, WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE, WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT, WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM, WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG, WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME, }; /* Match RWF_* bits to IOCB bits */ #define IOCB_HIPRI (__force int) RWF_HIPRI #define IOCB_DSYNC (__force int) RWF_DSYNC #define IOCB_SYNC (__force int) RWF_SYNC #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) #define IOCB_DIRECT (1 << 17) #define IOCB_WRITE (1 << 18) /* iocb->ki_waitq is valid */ #define IOCB_WAITQ (1 << 19) #define IOCB_NOIO (1 << 20) /* kiocb is a read or write operation submitted by fs/aio.c. */ #define IOCB_AIO_RW (1 << 23) struct kiocb { struct file *ki_filp; /* The 'ki_filp' pointer is shared in a union for aio */ randomized_struct_fields_start loff_t ki_pos; void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); void *private; int ki_flags; u16 ki_hint; u16 ki_ioprio; /* See linux/ioprio.h */ union { unsigned int ki_cookie; /* for ->iopoll */ struct wait_page_queue *ki_waitq; /* for async buffered IO */ }; randomized_struct_fields_end }; static inline bool is_sync_kiocb(struct kiocb *kiocb) { return kiocb->ki_complete == NULL; } /* * "descriptor" for what we're up to with a read. * This allows us to use the same read code yet * have multiple different users of the data that * we read from a file. * * The simplest case just copies the data to user * mode. */ typedef struct { size_t written; size_t count; union { char __user *buf; void *data; } arg; int error; } read_descriptor_t; typedef int (*read_actor_t)(read_descriptor_t *, struct page *, unsigned long, unsigned long); struct address_space_operations { int (*writepage)(struct page *page, struct writeback_control *wbc); int (*readpage)(struct file *, struct page *); /* Write back some dirty pages from this mapping. */ int (*writepages)(struct address_space *, struct writeback_control *); /* Set a page dirty. Return true if this dirtied it */ int (*set_page_dirty)(struct page *page); /* * Reads in the requested pages. Unlike ->readpage(), this is * PURELY used for read-ahead!. */ int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); void (*readahead)(struct readahead_control *); int (*write_begin)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata); int (*write_end)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, gfp_t); void (*freepage)(struct page *); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); /* * migrate the contents of a page to the specified target. If * migrate_mode is MIGRATE_ASYNC, it must not block. */ int (*migratepage) (struct address_space *, struct page *, struct page *, enum migrate_mode); bool (*isolate_page)(struct page *, isolate_mode_t); void (*putback_page)(struct page *); int (*launder_page) (struct page *); int (*is_partially_uptodate) (struct page *, unsigned long, unsigned long); void (*is_dirty_writeback) (struct page *, bool *, bool *); int (*error_remove_page)(struct address_space *, struct page *); /* swapfile support */ int (*swap_activate)(struct swap_info_struct *sis, struct file *file, sector_t *span); void (*swap_deactivate)(struct file *file); }; extern const struct address_space_operations empty_aops; /* * pagecache_write_begin/pagecache_write_end must be used by general code * to write into the pagecache. */ int pagecache_write_begin(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata); int pagecache_write_end(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); /** * struct address_space - Contents of a cacheable, mappable object. * @host: Owner, either the inode or the block_device. * @i_pages: Cached pages. * @gfp_mask: Memory allocation flags to use for allocating pages. * @i_mmap_writable: Number of VM_SHARED mappings. * @nr_thps: Number of THPs in the pagecache (non-shmem only). * @i_mmap: Tree of private and shared mappings. * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. * @nrpages: Number of page entries, protected by the i_pages lock. * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock. * @writeback_index: Writeback starts here. * @a_ops: Methods. * @flags: Error bits and flags (AS_*). * @wb_err: The most recent error which has occurred. * @private_lock: For use by the owner of the address_space. * @private_list: For use by the owner of the address_space. * @private_data: For use by the owner of the address_space. */ struct address_space { struct inode *host; struct xarray i_pages; gfp_t gfp_mask; atomic_t i_mmap_writable; #ifdef CONFIG_READ_ONLY_THP_FOR_FS /* number of thp, only for non-shmem files */ atomic_t nr_thps; #endif struct rb_root_cached i_mmap; struct rw_semaphore i_mmap_rwsem; unsigned long nrpages; unsigned long nrexceptional; pgoff_t writeback_index; const struct address_space_operations *a_ops; unsigned long flags; errseq_t wb_err; spinlock_t private_lock; struct list_head private_list; void *private_data; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but * must be enforced here for CRIS, to let the least significant bit * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. */ /* XArray tags, for tagging dirty and writeback pages in the pagecache. */ #define PAGECACHE_TAG_DIRTY XA_MARK_0 #define PAGECACHE_TAG_WRITEBACK XA_MARK_1 #define PAGECACHE_TAG_TOWRITE XA_MARK_2 /* * Returns true if any of the pages in the mapping are marked with the tag. */ static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag) { return xa_marked(&mapping->i_pages, tag); } static inline void i_mmap_lock_write(struct address_space *mapping) { down_write(&mapping->i_mmap_rwsem); } static inline int i_mmap_trylock_write(struct address_space *mapping) { return down_write_trylock(&mapping->i_mmap_rwsem); } static inline void i_mmap_unlock_write(struct address_space *mapping) { up_write(&mapping->i_mmap_rwsem); } static inline void i_mmap_lock_read(struct address_space *mapping) { down_read(&mapping->i_mmap_rwsem); } static inline void i_mmap_unlock_read(struct address_space *mapping) { up_read(&mapping->i_mmap_rwsem); } static inline void i_mmap_assert_locked(struct address_space *mapping) { lockdep_assert_held(&mapping->i_mmap_rwsem); } static inline void i_mmap_assert_write_locked(struct address_space *mapping) { lockdep_assert_held_write(&mapping->i_mmap_rwsem); } /* * Might pages of this file be mapped into userspace? */ static inline int mapping_mapped(struct address_space *mapping) { return !RB_EMPTY_ROOT(&mapping->i_mmap.rb_root); } /* * Might pages of this file have been modified in userspace? * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap * marks vma as VM_SHARED if it is shared, and the file was opened for * writing i.e. vma may be mprotected writable even if now readonly. * * If i_mmap_writable is negative, no new writable mappings are allowed. You * can only deny writable mappings, if none exists right now. */ static inline int mapping_writably_mapped(struct address_space *mapping) { return atomic_read(&mapping->i_mmap_writable) > 0; } static inline int mapping_map_writable(struct address_space *mapping) { return atomic_inc_unless_negative(&mapping->i_mmap_writable) ? 0 : -EPERM; } static inline void mapping_unmap_writable(struct address_space *mapping) { atomic_dec(&mapping->i_mmap_writable); } static inline int mapping_deny_writable(struct address_space *mapping) { return atomic_dec_unless_positive(&mapping->i_mmap_writable) ? 0 : -EBUSY; } static inline void mapping_allow_writable(struct address_space *mapping) { atomic_inc(&mapping->i_mmap_writable); } /* * Use sequence counter to get consistent i_size on 32-bit processors. */ #if BITS_PER_LONG==32 && defined(CONFIG_SMP) #include <linux/seqlock.h> #define __NEED_I_SIZE_ORDERED #define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount) #else #define i_size_ordered_init(inode) do { } while (0) #endif struct posix_acl; #define ACL_NOT_CACHED ((void *)(-1)) #define ACL_DONT_CACHE ((void *)(-3)) static inline struct posix_acl * uncached_acl_sentinel(struct task_struct *task) { return (void *)task + 1; } static inline bool is_uncached_acl(struct posix_acl *acl) { return (long)acl & 1; } #define IOP_FASTPERM 0x0001 #define IOP_LOOKUP 0x0002 #define IOP_NOFOLLOW 0x0004 #define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 struct fsnotify_mark_connector; /* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning * of the 'struct inode' */ struct inode { umode_t i_mode; unsigned short i_opflags; kuid_t i_uid; kgid_t i_gid; unsigned int i_flags; #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *i_acl; struct posix_acl *i_default_acl; #endif const struct inode_operations *i_op; struct super_block *i_sb; struct address_space *i_mapping; #ifdef CONFIG_SECURITY void *i_security; #endif /* Stat data, not accessed from path walking */ unsigned long i_ino; /* * Filesystems may only read i_nlink directly. They shall use the * following functions for modification: * * (set|clear|inc|drop)_nlink * inode_(inc|dec)_link_count */ union { const unsigned int i_nlink; unsigned int __i_nlink; }; dev_t i_rdev; loff_t i_size; struct timespec64 i_atime; struct timespec64 i_mtime; struct timespec64 i_ctime; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; u8 i_blkbits; u8 i_write_hint; blkcnt_t i_blocks; #ifdef __NEED_I_SIZE_ORDERED seqcount_t i_size_seqcount; #endif /* Misc */ unsigned long i_state; struct rw_semaphore i_rwsem; unsigned long dirtied_when; /* jiffies of first dirtying */ unsigned long dirtied_time_when; struct hlist_node i_hash; struct list_head i_io_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; /* the associated cgroup wb */ /* foreign inode detection, see wbc_detach_inode() */ int i_wb_frn_winner; u16 i_wb_frn_avg_time; u16 i_wb_frn_history; #endif struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; struct list_head i_wb_list; /* backing dev writeback list */ union { struct hlist_head i_dentry; struct rcu_head i_rcu; }; atomic64_t i_version; atomic64_t i_sequence; /* see futex */ atomic_t i_count; atomic_t i_dio_count; atomic_t i_writecount; #if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING) atomic_t i_readcount; /* struct files open RO */ #endif union { const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ void (*free_inode)(struct inode *); }; struct file_lock_context *i_flctx; struct address_space i_data; struct list_head i_devices; union { struct pipe_inode_info *i_pipe; struct block_device *i_bdev; struct cdev *i_cdev; char *i_link; unsigned i_dir_seq; }; __u32 i_generation; #ifdef CONFIG_FSNOTIFY __u32 i_fsnotify_mask; /* all events this inode cares about */ struct fsnotify_mark_connector __rcu *i_fsnotify_marks; #endif #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_info *i_crypt_info; #endif #ifdef CONFIG_FS_VERITY struct fsverity_info *i_verity_info; #endif void *i_private; /* fs or device private pointer */ } __randomize_layout; struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode); static inline unsigned int i_blocksize(const struct inode *node) { return (1 << node->i_blkbits); } static inline int inode_unhashed(struct inode *inode) { return hlist_unhashed(&inode->i_hash); } /* * __mark_inode_dirty expects inodes to be hashed. Since we don't * want special inodes in the fileset inode space, we make them * appear hashed, but do not put on any lists. hlist_del() * will work fine and require no locking. */ static inline void inode_fake_hash(struct inode *inode) { hlist_add_fake(&inode->i_hash); } /* * inode->i_mutex nesting subclasses for the lock validator: * * 0: the object of the current VFS operation * 1: parent * 2: child/target * 3: xattr * 4: second non-directory * 5: second parent (when locking independent directories in rename) * * I_MUTEX_NONDIR2 is for certain operations (such as rename) which lock two * non-directories at once. * * The locking order between these classes is * parent[2] -> child -> grandchild -> normal -> xattr -> second non-directory */ enum inode_i_mutex_lock_class { I_MUTEX_NORMAL, I_MUTEX_PARENT, I_MUTEX_CHILD, I_MUTEX_XATTR, I_MUTEX_NONDIR2, I_MUTEX_PARENT2, }; static inline void inode_lock(struct inode *inode) { down_write(&inode->i_rwsem); } static inline void inode_unlock(struct inode *inode) { up_write(&inode->i_rwsem); } static inline void inode_lock_shared(struct inode *inode) { down_read(&inode->i_rwsem); } static inline void inode_unlock_shared(struct inode *inode) { up_read(&inode->i_rwsem); } static inline int inode_trylock(struct inode *inode) { return down_write_trylock(&inode->i_rwsem); } static inline int inode_trylock_shared(struct inode *inode) { return down_read_trylock(&inode->i_rwsem); } static inline int inode_is_locked(struct inode *inode) { return rwsem_is_locked(&inode->i_rwsem); } static inline void inode_lock_nested(struct inode *inode, unsigned subclass) { down_write_nested(&inode->i_rwsem, subclass); } static inline void inode_lock_shared_nested(struct inode *inode, unsigned subclass) { down_read_nested(&inode->i_rwsem, subclass); } void lock_two_nondirectories(struct inode *, struct inode*); void unlock_two_nondirectories(struct inode *, struct inode*); /* * NOTE: in a 32bit arch with a preemptable kernel and * an UP compile the i_size_read/write must be atomic * with respect to the local cpu (unlike with preempt disabled), * but they don't need to be atomic with respect to other cpus like in * true SMP (so they need either to either locally disable irq around * the read or for example on x86 they can be still implemented as a * cmpxchg8b without the need of the lock prefix). For SMP compiles * and 64bit archs it makes no difference if preempt is enabled or not. */ static inline loff_t i_size_read(const struct inode *inode) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) loff_t i_size; unsigned int seq; do { seq = read_seqcount_begin(&inode->i_size_seqcount); i_size = inode->i_size; } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); return i_size; #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) loff_t i_size; preempt_disable(); i_size = inode->i_size; preempt_enable(); return i_size; #else return inode->i_size; #endif } /* * NOTE: unlike i_size_read(), i_size_write() does need locking around it * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount * can be lost, resulting in subsequent i_size_read() calls spinning forever. */ static inline void i_size_write(struct inode *inode, loff_t i_size) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) preempt_disable(); write_seqcount_begin(&inode->i_size_seqcount); inode->i_size = i_size; write_seqcount_end(&inode->i_size_seqcount); preempt_enable(); #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) preempt_disable(); inode->i_size = i_size; preempt_enable(); #else inode->i_size = i_size; #endif } static inline unsigned iminor(const struct inode *inode) { return MINOR(inode->i_rdev); } static inline unsigned imajor(const struct inode *inode) { return MAJOR(inode->i_rdev); } struct fown_struct { rwlock_t lock; /* protects pid, uid, euid fields */ struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ kuid_t uid, euid; /* uid/euid of process setting the owner */ int signum; /* posix.1b rt signal to be delivered on IO */ }; /* * Track a single file's readahead state */ struct file_ra_state { pgoff_t start; /* where readahead started */ unsigned int size; /* # of readahead pages */ unsigned int async_size; /* do asynchronous readahead when there are only # of pages ahead */ unsigned int ra_pages; /* Maximum readahead window */ unsigned int mmap_miss; /* Cache miss stat for mmap accesses */ loff_t prev_pos; /* Cache last read() position */ }; /* * Check if @index falls in the readahead windows. */ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) { return (index >= ra->start && index < ra->start + ra->size); } struct file { union { struct llist_node fu_llist; struct rcu_head fu_rcuhead; } f_u; struct path f_path; struct inode *f_inode; /* cached value */ const struct file_operations *f_op; /* * Protects f_ep_links, f_flags. * Must not be taken from IRQ context. */ spinlock_t f_lock; enum rw_hint f_write_hint; atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; struct mutex f_pos_lock; loff_t f_pos; struct fown_struct f_owner; const struct cred *f_cred; struct file_ra_state f_ra; u64 f_version; #ifdef CONFIG_SECURITY void *f_security; #endif /* needed for tty driver, and maybe others */ void *private_data; #ifdef CONFIG_EPOLL /* Used by fs/eventpoll.c to link all the hooks to this file */ struct list_head f_ep_links; struct list_head f_tfile_llink; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; errseq_t f_wb_err; errseq_t f_sb_err; /* for syncfs */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ struct file_handle { __u32 handle_bytes; int handle_type; /* file identifier */ unsigned char f_handle[]; }; static inline struct file *get_file(struct file *f) { atomic_long_inc(&f->f_count); return f; } #define get_file_rcu_many(x, cnt) \ atomic_long_add_unless(&(x)->f_count, (cnt), 0) #define get_file_rcu(x) get_file_rcu_many((x), 1) #define file_count(x) atomic_long_read(&(x)->f_count) #define MAX_NON_LFS ((1UL<<31) - 1) /* Page cache limit. The filesystems should put that into their s_maxbytes limits, otherwise bad things can happen in VM. */ #if BITS_PER_LONG==32 #define MAX_LFS_FILESIZE ((loff_t)ULONG_MAX << PAGE_SHIFT) #elif BITS_PER_LONG==64 #define MAX_LFS_FILESIZE ((loff_t)LLONG_MAX) #endif #define FL_POSIX 1 #define FL_FLOCK 2 #define FL_DELEG 4 /* NFSv4 delegation */ #define FL_ACCESS 8 /* not trying to lock, just looking */ #define FL_EXISTS 16 /* when unlocking, test for existence */ #define FL_LEASE 32 /* lease held on this file */ #define FL_CLOSE 64 /* unlock on close */ #define FL_SLEEP 128 /* A blocking lock */ #define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */ #define FL_UNLOCK_PENDING 512 /* Lease is being broken */ #define FL_OFDLCK 1024 /* lock is "owned" by struct file */ #define FL_LAYOUT 2048 /* outstanding pNFS layout */ #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE) /* * Special return value from posix_lock_file() and vfs_lock_file() for * asynchronous locking. */ #define FILE_LOCK_DEFERRED 1 /* legacy typedef, should eventually be removed */ typedef void *fl_owner_t; struct file_lock; struct file_lock_operations { void (*fl_copy_lock)(struct file_lock *, struct file_lock *); void (*fl_release_private)(struct file_lock *); }; struct lock_manager_operations { fl_owner_t (*lm_get_owner)(fl_owner_t); void (*lm_put_owner)(fl_owner_t); void (*lm_notify)(struct file_lock *); /* unblock callback */ int (*lm_grant)(struct file_lock *, int); bool (*lm_break)(struct file_lock *); int (*lm_change)(struct file_lock *, int, struct list_head *); void (*lm_setup)(struct file_lock *, void **); bool (*lm_breaker_owns_lease)(struct file_lock *); }; struct lock_manager { struct list_head list; /* * NFSv4 and up also want opens blocked during the grace period; * NLM doesn't care: */ bool block_opens; }; struct net; void locks_start_grace(struct net *, struct lock_manager *); void locks_end_grace(struct lock_manager *); bool locks_in_grace(struct net *); bool opens_in_grace(struct net *); /* that will die - we need it for nfs_lock_info */ #include <linux/nfs_fs_i.h> /* * struct file_lock represents a generic "file lock". It's used to represent * POSIX byte range locks, BSD (flock) locks, and leases. It's important to * note that the same struct is used to represent both a request for a lock and * the lock itself, but the same object is never used for both. * * FIXME: should we create a separate "struct lock_request" to help distinguish * these two uses? * * The varous i_flctx lists are ordered by: * * 1) lock owner * 2) lock range start * 3) lock range end * * Obviously, the last two criteria only matter for POSIX locks. */ struct file_lock { struct file_lock *fl_blocker; /* The lock, that is blocking us */ struct list_head fl_list; /* link into file_lock_context */ struct hlist_node fl_link; /* node in global lists */ struct list_head fl_blocked_requests; /* list of requests with * ->fl_blocker pointing here */ struct list_head fl_blocked_member; /* node in * ->fl_blocker->fl_blocked_requests */ fl_owner_t fl_owner; unsigned int fl_flags; unsigned char fl_type; unsigned int fl_pid; int fl_link_cpu; /* what cpu's list is this on? */ wait_queue_head_t fl_wait; struct file *fl_file; loff_t fl_start; loff_t fl_end; struct fasync_struct * fl_fasync; /* for lease break notifications */ /* for lease breaks: */ unsigned long fl_break_time; unsigned long fl_downgrade_time; const struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ const struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ union { struct nfs_lock_info nfs_fl; struct nfs4_lock_info nfs4_fl; struct { struct list_head link; /* link in AFS vnode's pending_locks list */ int state; /* state of grant or error if -ve */ unsigned int debug_id; } afs; } fl_u; } __randomize_layout; struct file_lock_context { spinlock_t flc_lock; struct list_head flc_flock; struct list_head flc_posix; struct list_head flc_lease; }; /* The following constant reflects the upper bound of the file/locking space */ #ifndef OFFSET_MAX #define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1))) #define OFFSET_MAX INT_LIMIT(loff_t) #define OFFT_OFFSET_MAX INT_LIMIT(off_t) #endif extern void send_sigio(struct fown_struct *fown, int fd, int band); #define locks_inode(f) file_inode(f) #ifdef CONFIG_FILE_LOCKING extern int fcntl_getlk(struct file *, unsigned int, struct flock *); extern int fcntl_setlk(unsigned int, struct file *, unsigned int, struct flock *); #if BITS_PER_LONG == 32 extern int fcntl_getlk64(struct file *, unsigned int, struct flock64 *); extern int fcntl_setlk64(unsigned int, struct file *, unsigned int, struct flock64 *); #endif extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); extern int fcntl_getlease(struct file *filp); /* fs/locks.c */ void locks_free_lock_context(struct inode *inode); void locks_free_lock(struct file_lock *fl); extern void locks_init_lock(struct file_lock *); extern struct file_lock * locks_alloc_lock(void); extern void locks_copy_lock(struct file_lock *, struct file_lock *); extern void locks_copy_conflock(struct file_lock *, struct file_lock *); extern void locks_remove_posix(struct file *, fl_owner_t); extern void locks_remove_file(struct file *); extern void locks_release_private(struct file_lock *); extern void posix_test_lock(struct file *, struct file_lock *); extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *); extern int locks_delete_block(struct file_lock *); extern int vfs_test_lock(struct file *, struct file_lock *); extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *); extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl); bool vfs_inode_has_locks(struct inode *inode); extern int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl); extern int __break_lease(struct inode *inode, unsigned int flags, unsigned int type); extern void lease_get_mtime(struct inode *, struct timespec64 *time); extern int generic_setlease(struct file *, long, struct file_lock **, void **priv); extern int vfs_setlease(struct file *, long, struct file_lock **, void **); extern int lease_modify(struct file_lock *, int, struct list_head *); struct notifier_block; extern int lease_register_notifier(struct notifier_block *); extern void lease_unregister_notifier(struct notifier_block *); struct files_struct; extern void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files); #else /* !CONFIG_FILE_LOCKING */ static inline int fcntl_getlk(struct file *file, unsigned int cmd, struct flock __user *user) { return -EINVAL; } static inline int fcntl_setlk(unsigned int fd, struct file *file, unsigned int cmd, struct flock __user *user) { return -EACCES; } #if BITS_PER_LONG == 32 static inline int fcntl_getlk64(struct file *file, unsigned int cmd, struct flock64 __user *user) { return -EINVAL; } static inline int fcntl_setlk64(unsigned int fd, struct file *file, unsigned int cmd, struct flock64 __user *user) { return -EACCES; } #endif static inline int fcntl_setlease(unsigned int fd, struct file *filp, long arg) { return -EINVAL; } static inline int fcntl_getlease(struct file *filp) { return F_UNLCK; } static inline void locks_free_lock_context(struct inode *inode) { } static inline void locks_init_lock(struct file_lock *fl) { return; } static inline void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) { return; } static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl) { return; } static inline void locks_remove_posix(struct file *filp, fl_owner_t owner) { return; } static inline void locks_remove_file(struct file *filp) { return; } static inline void posix_test_lock(struct file *filp, struct file_lock *fl) { return; } static inline int posix_lock_file(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { return -ENOLCK; } static inline int locks_delete_block(struct file_lock *waiter) { return -ENOENT; } static inline int vfs_test_lock(struct file *filp, struct file_lock *fl) { return 0; } static inline int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) { return -ENOLCK; } static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl) { return 0; } static inline bool vfs_inode_has_locks(struct inode *inode) { return false; } static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl) { return -ENOLCK; } static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) { return 0; } static inline void lease_get_mtime(struct inode *inode, struct timespec64 *time) { return; } static inline int generic_setlease(struct file *filp, long arg, struct file_lock **flp, void **priv) { return -EINVAL; } static inline int vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv) { return -EINVAL; } static inline int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose) { return -EINVAL; } struct files_struct; static inline void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files) {} #endif /* !CONFIG_FILE_LOCKING */ static inline struct inode *file_inode(const struct file *f) { return f->f_inode; } static inline struct dentry *file_dentry(const struct file *file) { return d_real(file->f_path.dentry, file_inode(file)); } static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) { return locks_lock_inode_wait(locks_inode(filp), fl); } struct fasync_struct { rwlock_t fa_lock; int magic; int fa_fd; struct fasync_struct *fa_next; /* singly linked list */ struct file *fa_file; struct rcu_head fa_rcu; }; #define FASYNC_MAGIC 0x4601 /* SMP safe fasync helpers: */ extern int fasync_helper(int, struct file *, int, struct fasync_struct **); extern struct fasync_struct *fasync_insert_entry(int, struct file *, struct fasync_struct **, struct fasync_struct *); extern int fasync_remove_entry(struct file *, struct fasync_struct **); extern struct fasync_struct *fasync_alloc(void); extern void fasync_free(struct fasync_struct *); /* can be called from interrupts */ extern void kill_fasync(struct fasync_struct **, int, int); extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force); extern int f_setown(struct file *filp, unsigned long arg, int force); extern void f_delown(struct file *filp); extern pid_t f_getown(struct file *filp); extern int send_sigurg(struct fown_struct *fown); /* * sb->s_flags. Note that these mirror the equivalent MS_* flags where * represented in both. */ #define SB_RDONLY BIT(0) /* Mount read-only */ #define SB_NOSUID BIT(1) /* Ignore suid and sgid bits */ #define SB_NODEV BIT(2) /* Disallow access to device special files */ #define SB_NOEXEC BIT(3) /* Disallow program execution */ #define SB_SYNCHRONOUS BIT(4) /* Writes are synced at once */ #define SB_MANDLOCK BIT(6) /* Allow mandatory locks on an FS */ #define SB_DIRSYNC BIT(7) /* Directory modifications are synchronous */ #define SB_NOATIME BIT(10) /* Do not update access times. */ #define SB_NODIRATIME BIT(11) /* Do not update directory access times */ #define SB_SILENT BIT(15) #define SB_POSIXACL BIT(16) /* VFS does not apply the umask */ #define SB_INLINECRYPT BIT(17) /* Use blk-crypto for encrypted files */ #define SB_KERNMOUNT BIT(22) /* this is a kern_mount call */ #define SB_I_VERSION BIT(23) /* Update inode I_version field */ #define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */ /* These sb flags are internal to the kernel */ #define SB_SUBMOUNT BIT(26) #define SB_FORCE BIT(27) #define SB_NOSEC BIT(28) #define SB_BORN BIT(29) #define SB_ACTIVE BIT(30) #define SB_NOUSER BIT(31) /* These flags relate to encoding and casefolding */ #define SB_ENC_STRICT_MODE_FL (1 << 0) #define sb_has_strict_encoding(sb) \ (sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL) /* * Umount options */ #define MNT_FORCE 0x00000001 /* Attempt to forcibily umount */ #define MNT_DETACH 0x00000002 /* Just detach from the tree */ #define MNT_EXPIRE 0x00000004 /* Mark for expiry */ #define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */ #define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */ /* sb->s_iflags */ #define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ #define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ #define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */ #define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ /* sb->s_iflags to limit user namespace mounts */ #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ #define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 #define SB_I_UNTRUSTED_MOUNTER 0x00000040 #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ /* Possible states of 'frozen' field */ enum { SB_UNFROZEN = 0, /* FS is unfrozen */ SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */ SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */ SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop * internal threads if needed) */ SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */ }; #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) struct sb_writers { int frozen; /* Is sb frozen? */ wait_queue_head_t wait_unfrozen; /* for get_super_thawed() */ struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; struct super_block { struct list_head s_list; /* Keep this first */ dev_t s_dev; /* search index; _not_ kdev_t */ unsigned char s_blocksize_bits; unsigned long s_blocksize; loff_t s_maxbytes; /* Max file size */ struct file_system_type *s_type; const struct super_operations *s_op; const struct dquot_operations *dq_op; const struct quotactl_ops *s_qcop; const struct export_operations *s_export_op; unsigned long s_flags; unsigned long s_iflags; /* internal SB_I_* flags */ unsigned long s_magic; struct dentry *s_root; struct rw_semaphore s_umount; int s_count; atomic_t s_active; #ifdef CONFIG_SECURITY void *s_security; #endif const struct xattr_handler **s_xattr; #ifdef CONFIG_FS_ENCRYPTION const struct fscrypt_operations *s_cop; struct fscrypt_keyring *s_master_keys; /* master crypto keys in use */ #endif #ifdef CONFIG_FS_VERITY const struct fsverity_operations *s_vop; #endif #ifdef CONFIG_UNICODE struct unicode_map *s_encoding; __u16 s_encoding_flags; #endif struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; struct backing_dev_info *s_bdi; struct mtd_info *s_mtd; struct hlist_node s_instances; unsigned int s_quota_types; /* Bitmask of supported quota types */ struct quota_info s_dquot; /* Diskquota specific options */ struct sb_writers s_writers; /* * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and * s_fsnotify_marks together for cache efficiency. They are frequently * accessed and rarely modified. */ void *s_fs_info; /* Filesystem private info */ /* Granularity of c/m/atime in ns (cannot be worse than a second) */ u32 s_time_gran; /* Time limits for c/m/atime in seconds */ time64_t s_time_min; time64_t s_time_max; #ifdef CONFIG_FSNOTIFY __u32 s_fsnotify_mask; struct fsnotify_mark_connector __rcu *s_fsnotify_marks; #endif char s_id[32]; /* Informational name */ uuid_t s_uuid; /* UUID */ unsigned int s_max_links; fmode_t s_mode; /* * The next field is for VFS *only*. No filesystems have any business * even looking at it. You had been warned. */ struct mutex s_vfs_rename_mutex; /* Kludge */ /* * Filesystem subtype. If non-empty the filesystem type field * in /proc/mounts will be "type.subtype" */ const char *s_subtype; const struct dentry_operations *s_d_op; /* default d_op for dentries */ /* * Saved pool identifier for cleancache (-1 means none) */ int cleancache_poolid; struct shrinker s_shrink; /* per-sb shrinker handle */ /* Number of inodes with nlink == 0 but still referenced */ atomic_long_t s_remove_count; /* Pending fsnotify inode refs */ atomic_long_t s_fsnotify_inode_refs; /* Being remounted read-only */ int s_readonly_remount; /* per-sb errseq_t for reporting writeback errors via syncfs */ errseq_t s_wb_err; /* AIO completions deferred from interrupt context */ struct workqueue_struct *s_dio_done_wq; struct hlist_head s_pins; /* * Owning user namespace and default context in which to * interpret filesystem uids, gids, quotas, device nodes, * xattrs and security labels. */ struct user_namespace *s_user_ns; /* * The list_lru structure is essentially just a pointer to a table * of per-node lru lists, each of which has its own spinlock. * There is no need to put them into separate cachelines. */ struct list_lru s_dentry_lru; struct list_lru s_inode_lru; struct rcu_head rcu; struct work_struct destroy_work; struct mutex s_sync_lock; /* sync serialisation lock */ /* * Indicates how deep in a filesystem stack this SB is */ int s_stack_depth; /* s_inode_list_lock protects s_inodes */ spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; struct list_head s_inodes; /* all inodes */ spinlock_t s_inode_wblist_lock; struct list_head s_inodes_wb; /* writeback inodes */ } __randomize_layout; /* Helper functions so that in most cases filesystems will * not need to deal directly with kuid_t and kgid_t and can * instead deal with the raw numeric values that are stored * in the filesystem. */ static inline uid_t i_uid_read(const struct inode *inode) { return from_kuid(inode->i_sb->s_user_ns, inode->i_uid); } static inline gid_t i_gid_read(const struct inode *inode) { return from_kgid(inode->i_sb->s_user_ns, inode->i_gid); } static inline void i_uid_write(struct inode *inode, uid_t uid) { inode->i_uid = make_kuid(inode->i_sb->s_user_ns, uid); } static inline void i_gid_write(struct inode *inode, gid_t gid) { inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid); } struct timespec64 current_time(struct inode *inode); struct timespec64 inode_set_ctime_current(struct inode *inode); /** * inode_get_ctime - fetch the current ctime from the inode * @inode: inode from which to fetch ctime * * Grab the current ctime from the inode and return it. */ static inline struct timespec64 inode_get_ctime(const struct inode *inode) { return inode->i_ctime; } /** * inode_set_ctime_to_ts - set the ctime in the inode * @inode: inode in which to set the ctime * @ts: value to set in the ctime field * * Set the ctime in @inode to @ts */ static inline struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts) { inode->i_ctime = ts; return ts; } /** * inode_set_ctime - set the ctime in the inode * @inode: inode in which to set the ctime * @sec: tv_sec value to set * @nsec: tv_nsec value to set * * Set the ctime in @inode to { @sec, @nsec } */ static inline struct timespec64 inode_set_ctime(struct inode *inode, time64_t sec, long nsec) { struct timespec64 ts = { .tv_sec = sec, .tv_nsec = nsec }; return inode_set_ctime_to_ts(inode, ts); } /* * Snapshotting support. */ /* * These are internal functions, please use sb_start_{write,pagefault,intwrite} * instead. */ static inline void __sb_end_write(struct super_block *sb, int level) { percpu_up_read(sb->s_writers.rw_sem + level-1); } static inline void __sb_start_write(struct super_block *sb, int level) { percpu_down_read(sb->s_writers.rw_sem + level - 1); } static inline bool __sb_start_write_trylock(struct super_block *sb, int level) { return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); } #define __sb_writers_acquired(sb, lev) \ percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) #define __sb_writers_release(sb, lev) \ percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) /** * sb_end_write - drop write access to a superblock * @sb: the super we wrote to * * Decrement number of writers to the filesystem. Wake up possible waiters * wanting to freeze the filesystem. */ static inline void sb_end_write(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_WRITE); } /** * sb_end_pagefault - drop write access to a superblock from a page fault * @sb: the super we wrote to * * Decrement number of processes handling write page fault to the filesystem. * Wake up possible waiters wanting to freeze the filesystem. */ static inline void sb_end_pagefault(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_PAGEFAULT); } /** * sb_end_intwrite - drop write access to a superblock for internal fs purposes * @sb: the super we wrote to * * Decrement fs-internal number of writers to the filesystem. Wake up possible * waiters wanting to freeze the filesystem. */ static inline void sb_end_intwrite(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_FS); } /** * sb_start_write - get write access to a superblock * @sb: the super we write to * * When a process wants to write data or metadata to a file system (i.e. dirty * a page or an inode), it should embed the operation in a sb_start_write() - * sb_end_write() pair to get exclusion against file system freezing. This * function increments number of writers preventing freezing. If the file * system is already frozen, the function waits until the file system is * thawed. * * Since freeze protection behaves as a lock, users have to preserve * ordering of freeze protection and other filesystem locks. Generally, * freeze protection should be the outermost lock. In particular, we have: * * sb_start_write * -> i_mutex (write path, truncate, directory ops, ...) * -> s_umount (freeze_super, thaw_super) */ static inline void sb_start_write(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_WRITE); } static inline bool sb_start_write_trylock(struct super_block *sb) { return __sb_start_write_trylock(sb, SB_FREEZE_WRITE); } /** * sb_start_pagefault - get write access to a superblock from a page fault * @sb: the super we write to * * When a process starts handling write page fault, it should embed the * operation into sb_start_pagefault() - sb_end_pagefault() pair to get * exclusion against file system freezing. This is needed since the page fault * is going to dirty a page. This function increments number of running page * faults preventing freezing. If the file system is already frozen, the * function waits until the file system is thawed. * * Since page fault freeze protection behaves as a lock, users have to preserve * ordering of freeze protection and other filesystem locks. It is advised to * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault * handling code implies lock dependency: * * mmap_lock * -> sb_start_pagefault */ static inline void sb_start_pagefault(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_PAGEFAULT); } /* * sb_start_intwrite - get write access to a superblock for internal fs purposes * @sb: the super we write to * * This is the third level of protection against filesystem freezing. It is * free for use by a filesystem. The only requirement is that it must rank * below sb_start_pagefault. * * For example filesystem can call sb_start_intwrite() when starting a * transaction which somewhat eases handling of freezing for internal sources * of filesystem changes (internal fs threads, discarding preallocation on file * close, etc.). */ static inline void sb_start_intwrite(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_FS); } static inline bool sb_start_intwrite_trylock(struct super_block *sb) { return __sb_start_write_trylock(sb, SB_FREEZE_FS); } extern bool inode_owner_or_capable(const struct inode *inode); /* * VFS helper functions.. */ extern int vfs_create(struct inode *, struct dentry *, umode_t, bool); extern int vfs_mkdir(struct inode *, struct dentry *, umode_t); extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t); extern int vfs_symlink(struct inode *, struct dentry *, const char *); extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **); extern int vfs_rmdir(struct inode *, struct dentry *); extern int vfs_unlink(struct inode *, struct dentry *, struct inode **); extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int); static inline int vfs_whiteout(struct inode *dir, struct dentry *dentry) { return vfs_mknod(dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); } extern struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag); int vfs_mkobj(struct dentry *, umode_t, int (*f)(struct dentry *, umode_t, void *), void *); int vfs_fchown(struct file *file, uid_t user, gid_t group); int vfs_fchmod(struct file *file, umode_t mode); int vfs_utimes(const struct path *path, struct timespec64 *times); extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #else #define compat_ptr_ioctl NULL #endif /* * VFS file helper functions. */ extern void inode_init_owner(struct inode *inode, const struct inode *dir, umode_t mode); extern bool may_open_dev(const struct path *path); umode_t mode_strip_sgid(const struct inode *dir, umode_t mode); /* * This is the "filldir" function type, used by readdir() to let * the kernel specify what kind of dirent layout it wants to have. * This allows the kernel to read directories into kernel space or * to have different dirent layouts depending on the binary type. */ struct dir_context; typedef int (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64, unsigned); struct dir_context { filldir_t actor; loff_t pos; }; /* * These flags let !MMU mmap() govern direct device mapping vs immediate * copying more easily for MAP_PRIVATE, especially for ROM filesystems. * * NOMMU_MAP_COPY: Copy can be mapped (MAP_PRIVATE) * NOMMU_MAP_DIRECT: Can be mapped directly (MAP_SHARED) * NOMMU_MAP_READ: Can be mapped for reading * NOMMU_MAP_WRITE: Can be mapped for writing * NOMMU_MAP_EXEC: Can be mapped for execution */ #define NOMMU_MAP_COPY 0x00000001 #define NOMMU_MAP_DIRECT 0x00000008 #define NOMMU_MAP_READ VM_MAYREAD #define NOMMU_MAP_WRITE VM_MAYWRITE #define NOMMU_MAP_EXEC VM_MAYEXEC #define NOMMU_VMFLAGS \ (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC) /* * These flags control the behavior of the remap_file_range function pointer. * If it is called with len == 0 that means "remap to end of source file". * See Documentation/filesystems/vfs.rst for more details about this call. * * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate) * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request */ #define REMAP_FILE_DEDUP (1 << 0) #define REMAP_FILE_CAN_SHORTEN (1 << 1) /* * These flags signal that the caller is ok with altering various aspects of * the behavior of the remap operation. The changes must be made by the * implementation; the vfs remap helper functions can take advantage of them. * Flags in this category exist to preserve the quirky behavior of the hoisted * btrfs clone/dedupe ioctls. */ #define REMAP_FILE_ADVISORY (REMAP_FILE_CAN_SHORTEN) /* * These flags control the behavior of vfs_copy_file_range(). * They are not available to the user via syscall. * * COPY_FILE_SPLICE: call splice direct instead of fs clone/copy ops */ #define COPY_FILE_SPLICE (1 << 0) struct iov_iter; struct file_operations { struct module *owner; loff_t (*llseek) (struct file *, loff_t, int); ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iopoll)(struct kiocb *kiocb, bool spin); int (*iterate) (struct file *, struct dir_context *); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); unsigned long mmap_supported_flags; int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); int (*fsync) (struct file *, loff_t, loff_t, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); int (*flock) (struct file *, int, struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); int (*setlease)(struct file *, long, struct file_lock **, void **); long (*fallocate)(struct file *file, int mode, loff_t offset, loff_t len); void (*show_fdinfo)(struct seq_file *m, struct file *f); #ifndef CONFIG_MMU unsigned (*mmap_capabilities)(struct file *); #endif ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); bool may_pollfree; } __randomize_layout; struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *); int (*permission) (struct inode *, int); struct posix_acl * (*get_acl)(struct inode *, int); int (*readlink) (struct dentry *, char __user *,int); int (*create) (struct inode *,struct dentry *, umode_t, bool); int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); int (*symlink) (struct inode *,struct dentry *,const char *); int (*mkdir) (struct inode *,struct dentry *,umode_t); int (*rmdir) (struct inode *,struct dentry *); int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t); int (*rename) (struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (const struct path *, struct kstat *, u32, unsigned int); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); int (*update_time)(struct inode *, struct timespec64 *, int); int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode); int (*tmpfile) (struct inode *, struct dentry *, umode_t); int (*set_acl)(struct inode *, struct posix_acl *, int); } ____cacheline_aligned; static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio, struct iov_iter *iter) { return file->f_op->read_iter(kio, iter); } static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio, struct iov_iter *iter) { return file->f_op->write_iter(kio, iter); } static inline int call_mmap(struct file *file, struct vm_area_struct *vma) { return file->f_op->mmap(file, vma); } extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags); extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *count, unsigned int remap_flags); extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); extern int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same); extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, loff_t len, unsigned int remap_flags); struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); void (*destroy_inode)(struct inode *); void (*free_inode)(struct inode *); void (*dirty_inode) (struct inode *, int flags); int (*write_inode) (struct inode *, struct writeback_control *wbc); int (*drop_inode) (struct inode *); void (*evict_inode) (struct inode *); void (*put_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); int (*freeze_super) (struct super_block *); int (*freeze_fs) (struct super_block *); int (*thaw_super) (struct super_block *); int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*umount_begin) (struct super_block *); int (*show_options)(struct seq_file *, struct dentry *); int (*show_devname)(struct seq_file *, struct dentry *); int (*show_path)(struct seq_file *, struct dentry *); int (*show_stats)(struct seq_file *, struct dentry *); #ifdef CONFIG_QUOTA ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); struct dquot **(*get_dquots)(struct inode *); #endif int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); long (*nr_cached_objects)(struct super_block *, struct shrink_control *); long (*free_cached_objects)(struct super_block *, struct shrink_control *); }; /* * Inode flags - they have no relation to superblock flags now */ #define S_SYNC (1 << 0) /* Writes are synced at once */ #define S_NOATIME (1 << 1) /* Do not update access times */ #define S_APPEND (1 << 2) /* Append-only file */ #define S_IMMUTABLE (1 << 3) /* Immutable file */ #define S_DEAD (1 << 4) /* removed, but still open directory */ #define S_NOQUOTA (1 << 5) /* Inode is not counted to quota */ #define S_DIRSYNC (1 << 6) /* Directory modifications are synchronous */ #define S_NOCMTIME (1 << 7) /* Do not update file c/mtime */ #define S_SWAPFILE (1 << 8) /* Do not truncate: swapon got its bmaps */ #define S_PRIVATE (1 << 9) /* Inode is fs-internal */ #define S_IMA (1 << 10) /* Inode has an associated IMA struct */ #define S_AUTOMOUNT (1 << 11) /* Automount/referral quasi-directory */ #define S_NOSEC (1 << 12) /* no suid or xattr security attributes */ #ifdef CONFIG_FS_DAX #define S_DAX (1 << 13) /* Direct Access, avoiding the page cache */ #else #define S_DAX 0 /* Make all the DAX code disappear */ #endif #define S_ENCRYPTED (1 << 14) /* Encrypted file (using fs/crypto/) */ #define S_CASEFOLD (1 << 15) /* Casefolded file */ #define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */ /* * Note that nosuid etc flags are inode-specific: setting some file-system * flags just means all the inodes inherit those flags by default. It might be * possible to override it selectively if you really wanted to with some * ioctl() that is not currently implemented. * * Exception: SB_RDONLY is always applied to the entire file system. * * Unfortunately, it is possible to change a filesystems flags with it mounted * with files in use. This means that all of the inodes will not have their * i_flags updated. Hence, i_flags no longer inherit the superblock mount * flags, so these have to be checked separately. -- rmk@arm.uk.linux.org */ #define __IS_FLG(inode, flg) ((inode)->i_sb->s_flags & (flg)) static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags & SB_RDONLY; } #define IS_RDONLY(inode) sb_rdonly((inode)->i_sb) #define IS_SYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS) || \ ((inode)->i_flags & S_SYNC)) #define IS_DIRSYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS|SB_DIRSYNC) || \ ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) #define IS_MANDLOCK(inode) __IS_FLG(inode, SB_MANDLOCK) #define IS_NOATIME(inode) __IS_FLG(inode, SB_RDONLY|SB_NOATIME) #define IS_I_VERSION(inode) __IS_FLG(inode, SB_I_VERSION) #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) #define IS_POSIXACL(inode) __IS_FLG(inode, SB_POSIXACL) #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME) #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE) #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE) #define IS_IMA(inode) ((inode)->i_flags & S_IMA) #define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) #define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC) #define IS_DAX(inode) ((inode)->i_flags & S_DAX) #define IS_ENCRYPTED(inode) ((inode)->i_flags & S_ENCRYPTED) #define IS_CASEFOLDED(inode) ((inode)->i_flags & S_CASEFOLD) #define IS_VERITY(inode) ((inode)->i_flags & S_VERITY) #define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ (inode)->i_rdev == WHITEOUT_DEV) static inline bool HAS_UNMAPPED_ID(struct inode *inode) { return !uid_valid(inode->i_uid) || !gid_valid(inode->i_gid); } static inline enum rw_hint file_write_hint(struct file *file) { if (file->f_write_hint != WRITE_LIFE_NOT_SET) return file->f_write_hint; return file_inode(file)->i_write_hint; } static inline int iocb_flags(struct file *file); static inline u16 ki_hint_validate(enum rw_hint hint) { typeof(((struct kiocb *)0)->ki_hint) max_hint = -1; if (hint <= max_hint) return hint; return 0; } static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) { *kiocb = (struct kiocb) { .ki_filp = filp, .ki_flags = iocb_flags(filp), .ki_hint = ki_hint_validate(file_write_hint(filp)), .ki_ioprio = get_current_ioprio(), }; } static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, struct file *filp) { *kiocb = (struct kiocb) { .ki_filp = filp, .ki_flags = kiocb_src->ki_flags, .ki_hint = kiocb_src->ki_hint, .ki_ioprio = kiocb_src->ki_ioprio, .ki_pos = kiocb_src->ki_pos, }; } /* * Inode state bits. Protected by inode->i_lock * * Three bits determine the dirty state of the inode, I_DIRTY_SYNC, * I_DIRTY_DATASYNC and I_DIRTY_PAGES. * * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at * various stages of removing an inode. * * Two bits are used for locking and completion notification, I_NEW and I_SYNC. * * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on * fdatasync(). i_atime is the usual cause. * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of * these changes separately from I_DIRTY_SYNC so that we * don't have to write inode on fdatasync() when only * mtime has changed in it. * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. * I_NEW Serves as both a mutex and completion notification. * New inodes set I_NEW. If two processes both create * the same inode, one of them will release its inode and * wait for I_NEW to be released before returning. * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can * also cause waiting on I_NEW, without I_NEW actually * being set. find_inode() uses this to prevent returning * nearly-dead inodes. * I_WILL_FREE Must be set when calling write_inode_now() if i_count * is zero. I_FREEING must be set when I_WILL_FREE is * cleared. * I_FREEING Set when inode is about to be freed but still has dirty * pages or buffers attached or the inode itself is still * dirty. * I_CLEAR Added by clear_inode(). In this state the inode is * clean and can be destroyed. Inode keeps I_FREEING. * * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are * prohibited for many purposes. iget() must wait for * the inode to be completely released, then create it * anew. Other functions will just ignore such inodes, * if appropriate. I_NEW is used for waiting. * * I_SYNC Writeback of inode is running. The bit is set during * data writeback, and cleared with a wakeup on the bit * address once it is done. The bit is also used to pin * the inode in memory for flusher thread. * * I_REFERENCED Marks the inode as recently references on the LRU list. * * I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit(). * * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to * synchronize competing switching instances and to tell * wb stat updates to grab the i_pages lock. See * inode_switch_wbs_work_fn() for details. * * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper * and work dirs among overlayfs mounts. * * I_CREATING New object's inode in the middle of setting up. * * I_DONTCACHE Evict inode as soon as it is not used anymore. * * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. * Used to detect that mark_inode_dirty() should not move * inode between dirty lists. * * Q: What is the difference between I_WILL_FREE and I_FREEING? */ #define I_DIRTY_SYNC (1 << 0) #define I_DIRTY_DATASYNC (1 << 1) #define I_DIRTY_PAGES (1 << 2) #define __I_NEW 3 #define I_NEW (1 << __I_NEW) #define I_WILL_FREE (1 << 4) #define I_FREEING (1 << 5) #define I_CLEAR (1 << 6) #define __I_SYNC 7 #define I_SYNC (1 << __I_SYNC) #define I_REFERENCED (1 << 8) #define __I_DIO_WAKEUP 9 #define I_DIO_WAKEUP (1 << __I_DIO_WAKEUP) #define I_LINKABLE (1 << 10) #define I_DIRTY_TIME (1 << 11) #define I_WB_SWITCH (1 << 13) #define I_OVL_INUSE (1 << 14) #define I_CREATING (1 << 15) #define I_DONTCACHE (1 << 16) #define I_SYNC_QUEUED (1 << 17) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) extern void __mark_inode_dirty(struct inode *, int); static inline void mark_inode_dirty(struct inode *inode) { __mark_inode_dirty(inode, I_DIRTY); } static inline void mark_inode_dirty_sync(struct inode *inode) { __mark_inode_dirty(inode, I_DIRTY_SYNC); } extern void inc_nlink(struct inode *inode); extern void drop_nlink(struct inode *inode); extern void clear_nlink(struct inode *inode); extern void set_nlink(struct inode *inode, unsigned int nlink); static inline void inode_inc_link_count(struct inode *inode) { inc_nlink(inode); mark_inode_dirty(inode); } static inline void inode_dec_link_count(struct inode *inode) { drop_nlink(inode); mark_inode_dirty(inode); } enum file_time_flags { S_ATIME = 1, S_MTIME = 2, S_CTIME = 4, S_VERSION = 8, }; extern bool atime_needs_update(const struct path *, struct inode *); extern void touch_atime(const struct path *); int inode_update_time(struct inode *inode, struct timespec64 *time, int flags); static inline void file_accessed(struct file *file) { if (!(file->f_flags & O_NOATIME)) touch_atime(&file->f_path); } extern int file_modified(struct file *file); int sync_inode(struct inode *inode, struct writeback_control *wbc); int sync_inode_metadata(struct inode *inode, int wait); struct file_system_type { const char *name; int fs_flags; #define FS_REQUIRES_DEV 1 #define FS_BINARY_MOUNTDATA 2 #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */ #define FS_THP_SUPPORT 8192 /* Remove once all fs converted */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); void (*kill_sb) (struct super_block *); struct module *owner; struct file_system_type * next; struct hlist_head fs_supers; struct lock_class_key s_lock_key; struct lock_class_key s_umount_key; struct lock_class_key s_vfs_rename_key; struct lock_class_key s_writers_key[SB_FREEZE_LEVELS]; struct lock_class_key i_lock_key; struct lock_class_key i_mutex_key; struct lock_class_key i_mutex_dir_key; }; #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME) extern struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_single(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_nodev(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); void generic_shutdown_super(struct super_block *sb); void kill_block_super(struct super_block *sb); void kill_anon_super(struct super_block *sb); void kill_litter_super(struct super_block *sb); void deactivate_super(struct super_block *sb); void deactivate_locked_super(struct super_block *sb); int set_anon_super(struct super_block *s, void *data); int set_anon_super_fc(struct super_block *s, struct fs_context *fc); int get_anon_bdev(dev_t *); void free_anon_bdev(dev_t); struct super_block *sget_fc(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*set)(struct super_block *, struct fs_context *)); struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), int flags, void *data); /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ #define fops_get(fops) \ (((fops) && try_module_get((fops)->owner) ? (fops) : NULL)) #define fops_put(fops) \ do { if (fops) module_put((fops)->owner); } while(0) /* * This one is to be used *ONLY* from ->open() instances. * fops must be non-NULL, pinned down *and* module dependencies * should be sufficient to pin the caller down as well. */ #define replace_fops(f, fops) \ do { \ struct file *__file = (f); \ fops_put(__file->f_op); \ BUG_ON(!(__file->f_op = (fops))); \ } while(0) extern int register_filesystem(struct file_system_type *); extern int unregister_filesystem(struct file_system_type *); extern struct vfsmount *kern_mount(struct file_system_type *); extern void kern_unmount(struct vfsmount *mnt); extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); extern long do_mount(const char *, const char __user *, const char *, unsigned long, void *); extern struct vfsmount *collect_mounts(const struct path *); extern void drop_collected_mounts(struct vfsmount *); extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *, struct vfsmount *); extern int vfs_statfs(const struct path *, struct kstatfs *); extern int user_statfs(const char __user *, struct kstatfs *); extern int fd_statfs(int, struct kstatfs *); extern int freeze_super(struct super_block *super); extern int thaw_super(struct super_block *super); extern bool our_mnt(struct vfsmount *mnt); extern __printf(2, 3) int super_setup_bdi_name(struct super_block *sb, char *fmt, ...); extern int super_setup_bdi(struct super_block *sb); extern int current_umask(void); extern void ihold(struct inode * inode); extern void iput(struct inode *); extern int generic_update_time(struct inode *, struct timespec64 *, int); /* /sys/fs */ extern struct kobject *fs_kobj; #define MAX_RW_COUNT (INT_MAX & PAGE_MASK) #ifdef CONFIG_MANDATORY_FILE_LOCKING extern int locks_mandatory_locked(struct file *); extern int locks_mandatory_area(struct inode *, struct file *, loff_t, loff_t, unsigned char); /* * Candidates for mandatory locking have the setgid bit set * but no group execute bit - an otherwise meaningless combination. */ static inline int __mandatory_lock(struct inode *ino) { return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID; } /* * ... and these candidates should be on SB_MANDLOCK mounted fs, * otherwise these will be advisory locks */ static inline int mandatory_lock(struct inode *ino) { return IS_MANDLOCK(ino) && __mandatory_lock(ino); } static inline int locks_verify_locked(struct file *file) { if (mandatory_lock(locks_inode(file))) return locks_mandatory_locked(file); return 0; } static inline int locks_verify_truncate(struct inode *inode, struct file *f, loff_t size) { if (!inode->i_flctx || !mandatory_lock(inode)) return 0; if (size < inode->i_size) { return locks_mandatory_area(inode, f, size, inode->i_size - 1, F_WRLCK); } else { return locks_mandatory_area(inode, f, inode->i_size, size - 1, F_WRLCK); } } #else /* !CONFIG_MANDATORY_FILE_LOCKING */ static inline int locks_mandatory_locked(struct file *file) { return 0; } static inline int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start, loff_t end, unsigned char type) { return 0; } static inline int __mandatory_lock(struct inode *inode) { return 0; } static inline int mandatory_lock(struct inode *inode) { return 0; } static inline int locks_verify_locked(struct file *file) { return 0; } static inline int locks_verify_truncate(struct inode *inode, struct file *filp, size_t size) { return 0; } #endif /* CONFIG_MANDATORY_FILE_LOCKING */ #ifdef CONFIG_FILE_LOCKING static inline int break_lease(struct inode *inode, unsigned int mode) { /* * Since this check is lockless, we must ensure that any refcounts * taken are done before checking i_flctx->flc_lease. Otherwise, we * could end up racing with tasks trying to set a new lease on this * file. */ smp_mb(); if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) return __break_lease(inode, mode, FL_LEASE); return 0; } static inline int break_deleg(struct inode *inode, unsigned int mode) { /* * Since this check is lockless, we must ensure that any refcounts * taken are done before checking i_flctx->flc_lease. Otherwise, we * could end up racing with tasks trying to set a new lease on this * file. */ smp_mb(); if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) return __break_lease(inode, mode, FL_DELEG); return 0; } static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) { int ret; ret = break_deleg(inode, O_WRONLY|O_NONBLOCK); if (ret == -EWOULDBLOCK && delegated_inode) { *delegated_inode = inode; ihold(inode); } return ret; } static inline int break_deleg_wait(struct inode **delegated_inode) { int ret; ret = break_deleg(*delegated_inode, O_WRONLY); iput(*delegated_inode); *delegated_inode = NULL; return ret; } static inline int break_layout(struct inode *inode, bool wait) { smp_mb(); if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) return __break_lease(inode, wait ? O_WRONLY : O_WRONLY | O_NONBLOCK, FL_LAYOUT); return 0; } #else /* !CONFIG_FILE_LOCKING */ static inline int break_lease(struct inode *inode, unsigned int mode) { return 0; } static inline int break_deleg(struct inode *inode, unsigned int mode) { return 0; } static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) { return 0; } static inline int break_deleg_wait(struct inode **delegated_inode) { BUG(); return 0; } static inline int break_layout(struct inode *inode, bool wait) { return 0; } #endif /* CONFIG_FILE_LOCKING */ /* fs/open.c */ struct audit_names; struct filename { const char *name; /* pointer to actual string */ const __user char *uptr; /* original userland pointer */ int refcnt; struct audit_names *aname; const char iname[]; }; static_assert(offsetof(struct filename, iname) % sizeof(long) == 0); extern long vfs_truncate(const struct path *, loff_t); extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs, struct file *filp); extern int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode); extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(struct dentry *, struct vfsmount *, const char *, int, umode_t); extern struct file * dentry_open(const struct path *, int, const struct cred *); extern struct file * open_with_fake_path(const struct path *, int, struct inode*, const struct cred *); static inline struct file *file_clone_open(struct file *file) { return dentry_open(&file->f_path, file->f_flags, file->f_cred); } extern int filp_close(struct file *, fl_owner_t id); extern struct filename *getname_flags(const char __user *, int, int *); extern struct filename *getname(const char __user *); extern struct filename *getname_kernel(const char *); extern void putname(struct filename *name); extern int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)); extern int finish_no_open(struct file *file, struct dentry *dentry); /* fs/dcache.c */ extern void __init vfs_caches_init_early(void); extern void __init vfs_caches_init(void); extern struct kmem_cache *names_cachep; #define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL) #define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) extern struct super_block *blockdev_superblock; static inline bool sb_is_blkdev_sb(struct super_block *sb) { return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock; } void emergency_thaw_all(void); extern int sync_filesystem(struct super_block *); extern const struct file_operations def_blk_fops; extern const struct file_operations def_chr_fops; /* fs/char_dev.c */ #define CHRDEV_MAJOR_MAX 512 /* Marks the bottom of the first segment of free char majors */ #define CHRDEV_MAJOR_DYN_END 234 /* Marks the top and bottom of the second segment of free char majors */ #define CHRDEV_MAJOR_DYN_EXT_START 511 #define CHRDEV_MAJOR_DYN_EXT_END 384 extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *); extern int register_chrdev_region(dev_t, unsigned, const char *); extern int __register_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops); extern void __unregister_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name); extern void unregister_chrdev_region(dev_t, unsigned); extern void chrdev_show(struct seq_file *,off_t); static inline int register_chrdev(unsigned int major, const char *name, const struct file_operations *fops) { return __register_chrdev(major, 0, 256, name, fops); } static inline void unregister_chrdev(unsigned int major, const char *name) { __unregister_chrdev(major, 0, 256, name); } extern void init_special_inode(struct inode *, umode_t, dev_t); /* Invalid inode operations -- fs/bad_inode.c */ extern void make_bad_inode(struct inode *); extern bool is_bad_inode(struct inode *); unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end); void invalidate_mapping_pagevec(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_pagevec); static inline void invalidate_remote_inode(struct inode *inode) { if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) invalidate_mapping_pages(inode->i_mapping, 0, -1); } extern int invalidate_inode_pages2(struct address_space *mapping); extern int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end); extern int write_inode_now(struct inode *, int); extern int filemap_fdatawrite(struct address_space *); extern int filemap_flush(struct address_space *); extern int filemap_fdatawait_keep_errors(struct address_space *mapping); extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); extern int filemap_fdatawait_range_keep_errors(struct address_space *mapping, loff_t start_byte, loff_t end_byte); static inline int filemap_fdatawait(struct address_space *mapping) { return filemap_fdatawait_range(mapping, 0, LLONG_MAX); } extern bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); extern int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); extern int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode); extern int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end); extern int filemap_check_errors(struct address_space *mapping); extern void __filemap_set_wb_err(struct address_space *mapping, int err); static inline int filemap_write_and_wait(struct address_space *mapping) { return filemap_write_and_wait_range(mapping, 0, LLONG_MAX); } extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, loff_t lend); extern int __must_check file_check_and_advance_wb_err(struct file *file); extern int __must_check file_write_and_wait_range(struct file *file, loff_t start, loff_t end); static inline int file_write_and_wait(struct file *file) { return file_write_and_wait_range(file, 0, LLONG_MAX); } /** * filemap_set_wb_err - set a writeback error on an address_space * @mapping: mapping in which to set writeback error * @err: error to be set in mapping * * When writeback fails in some way, we must record that error so that * userspace can be informed when fsync and the like are called. We endeavor * to report errors on any file that was open at the time of the error. Some * internal callers also need to know when writeback errors have occurred. * * When a writeback error occurs, most filesystems will want to call * filemap_set_wb_err to record the error in the mapping so that it will be * automatically reported whenever fsync is called on the file. */ static inline void filemap_set_wb_err(struct address_space *mapping, int err) { /* Fastpath for common case of no error */ if (unlikely(err)) __filemap_set_wb_err(mapping, err); } /** * filemap_check_wb_err - has an error occurred since the mark was sampled? * @mapping: mapping to check for writeback errors * @since: previously-sampled errseq_t * * Grab the errseq_t value from the mapping, and see if it has changed "since" * the given value was sampled. * * If it has then report the latest error set, otherwise return 0. */ static inline int filemap_check_wb_err(struct address_space *mapping, errseq_t since) { return errseq_check(&mapping->wb_err, since); } /** * filemap_sample_wb_err - sample the current errseq_t to test for later errors * @mapping: mapping to be sampled * * Writeback errors are always reported relative to a particular sample point * in the past. This function provides those sample points. */ static inline errseq_t filemap_sample_wb_err(struct address_space *mapping) { return errseq_sample(&mapping->wb_err); } /** * file_sample_sb_err - sample the current errseq_t to test for later errors * @file: file pointer to be sampled * * Grab the most current superblock-level errseq_t value for the given * struct file. */ static inline errseq_t file_sample_sb_err(struct file *file) { return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err); } extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, unsigned int flags); /* * Sync the bytes written if this was a synchronous write. Expect ki_pos * to already be updated for the write, and will return either the amount * of bytes passed in, or an error if syncing the file failed. */ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) { if (iocb->ki_flags & IOCB_DSYNC) { int ret = vfs_fsync_range(iocb->ki_filp, iocb->ki_pos - count, iocb->ki_pos - 1, (iocb->ki_flags & IOCB_SYNC) ? 0 : 1); if (ret) return ret; } return count; } extern void emergency_sync(void); extern void emergency_remount(void); #ifdef CONFIG_BLOCK extern int bmap(struct inode *inode, sector_t *block); #else static inline int bmap(struct inode *inode, sector_t *block) { return -EINVAL; } #endif extern int notify_change(struct dentry *, struct iattr *, struct inode **); extern int inode_permission(struct inode *, int); extern int generic_permission(struct inode *, int); extern int __check_sticky(struct inode *dir, struct inode *inode); static inline bool execute_ok(struct inode *inode) { return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode); } static inline bool inode_wrong_type(const struct inode *inode, umode_t mode) { return (inode->i_mode ^ mode) & S_IFMT; } static inline void file_start_write(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return; sb_start_write(file_inode(file)->i_sb); } static inline bool file_start_write_trylock(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return true; return sb_start_write_trylock(file_inode(file)->i_sb); } static inline void file_end_write(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return; __sb_end_write(file_inode(file)->i_sb, SB_FREEZE_WRITE); } /* * get_write_access() gets write permission for a file. * put_write_access() releases this write permission. * This is used for regular files. * We cannot support write (and maybe mmap read-write shared) accesses and * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode * can have the following values: * 0: no writers, no VM_DENYWRITE mappings * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist * > 0: (i_writecount) users are writing to the file. * * Normally we operate on that counter with atomic_{inc,dec} and it's safe * except for the cases where we don't hold i_writecount yet. Then we need to * use {get,deny}_write_access() - these functions check the sign and refuse * to do the change if sign is wrong. */ static inline int get_write_access(struct inode *inode) { return atomic_inc_unless_negative(&inode->i_writecount) ? 0 : -ETXTBSY; } static inline int deny_write_access(struct file *file) { struct inode *inode = file_inode(file); return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY; } static inline void put_write_access(struct inode * inode) { atomic_dec(&inode->i_writecount); } static inline void allow_write_access(struct file *file) { if (file) atomic_inc(&file_inode(file)->i_writecount); } static inline bool inode_is_open_for_write(const struct inode *inode) { return atomic_read(&inode->i_writecount) > 0; } #if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING) static inline void i_readcount_dec(struct inode *inode) { BUG_ON(!atomic_read(&inode->i_readcount)); atomic_dec(&inode->i_readcount); } static inline void i_readcount_inc(struct inode *inode) { atomic_inc(&inode->i_readcount); } #else static inline void i_readcount_dec(struct inode *inode) { return; } static inline void i_readcount_inc(struct inode *inode) { return; } #endif extern int do_pipe_flags(int *, int); extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *); ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos); extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *); extern ssize_t __kernel_write(struct file *, const void *, size_t, loff_t *); extern struct file * open_exec(const char *); /* fs/dcache.c -- generic fs support functions */ extern bool is_subdir(struct dentry *, struct dentry *); extern bool path_is_under(const struct path *, const struct path *); extern char *file_path(struct file *, char *, int); #include <linux/err.h> /* needed for stackable file system support */ extern loff_t default_llseek(struct file *file, loff_t offset, int whence); extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence); extern int inode_init_always(struct super_block *, struct inode *); extern void inode_init_once(struct inode *); extern void address_space_init_once(struct address_space *mapping); extern struct inode * igrab(struct inode *); extern ino_t iunique(struct super_block *, ino_t); extern int inode_needs_sync(struct inode *inode); extern int generic_delete_inode(struct inode *inode); static inline int generic_drop_inode(struct inode *inode) { return !inode->i_nlink || inode_unhashed(inode); } extern void d_mark_dontcache(struct inode *inode); extern struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data); extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data); extern struct inode *ilookup(struct super_block *sb, unsigned long ino); extern struct inode *inode_insert5(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data); extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); extern struct inode * iget_locked(struct super_block *, unsigned long); extern struct inode *find_inode_nowait(struct super_block *, unsigned long, int (*match)(struct inode *, unsigned long, void *), void *data); extern struct inode *find_inode_rcu(struct super_block *, unsigned long, int (*)(struct inode *, void *), void *); extern struct inode *find_inode_by_ino_rcu(struct super_block *, unsigned long); extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); extern int insert_inode_locked(struct inode *); #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void lockdep_annotate_inode_mutex_key(struct inode *inode); #else static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }; #endif extern void unlock_new_inode(struct inode *); extern void discard_new_inode(struct inode *); extern unsigned int get_next_ino(void); extern void evict_inodes(struct super_block *sb); /* * Userspace may rely on the the inode number being non-zero. For example, glibc * simply ignores files with zero i_ino in unlink() and other places. * * As an additional complication, if userspace was compiled with * _FILE_OFFSET_BITS=32 on a 64-bit kernel we'll only end up reading out the * lower 32 bits, so we need to check that those aren't zero explicitly. With * _FILE_OFFSET_BITS=64, this may cause some harmless false-negatives, but * better safe than sorry. */ static inline bool is_zero_ino(ino_t ino) { return (u32)ino == 0; } extern void __iget(struct inode * inode); extern void iget_failed(struct inode *); extern void clear_inode(struct inode *); extern void __destroy_inode(struct inode *); extern struct inode *new_inode_pseudo(struct super_block *sb); extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int setattr_should_drop_suidgid(struct inode *); extern int file_remove_privs(struct file *); extern void __insert_inode_hash(struct inode *, unsigned long hashval); static inline void insert_inode_hash(struct inode *inode) { __insert_inode_hash(inode, inode->i_ino); } extern void __remove_inode_hash(struct inode *); static inline void remove_inode_hash(struct inode *inode) { if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash)) __remove_inode_hash(inode); } extern void inode_sb_list_add(struct inode *inode); extern int sb_set_blocksize(struct super_block *, int); extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); extern int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count); extern int generic_file_rw_checks(struct file *file_in, struct file *file_out); extern ssize_t generic_file_buffered_read(struct kiocb *iocb, struct iov_iter *to, ssize_t already_read); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *); extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags); ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags); ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, struct iov_iter *iter); ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter); /* fs/block_dev.c */ extern ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to); extern ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from); extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync); extern void block_sync_page(struct page *page); /* fs/splice.c */ extern ssize_t generic_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); extern ssize_t iter_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, loff_t *, size_t len, unsigned int flags); extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, loff_t *opos, size_t len, unsigned int flags); extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); extern loff_t noop_llseek(struct file *file, loff_t offset, int whence); extern loff_t no_llseek(struct file *file, loff_t offset, int whence); extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize); extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence); extern loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof); extern loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size); extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t); extern loff_t no_seek_end_llseek(struct file *, loff_t, int); extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); extern int stream_open(struct inode * inode, struct file * filp); #ifdef CONFIG_BLOCK typedef void (dio_submit_t)(struct bio *bio, struct inode *inode, loff_t file_offset); enum { /* need locking between buffered and direct access */ DIO_LOCKING = 0x01, /* filesystem does not support filling holes */ DIO_SKIP_HOLES = 0x02, }; ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags); static inline ssize_t blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct iov_iter *iter, get_block_t get_block) { return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, get_block, NULL, NULL, DIO_LOCKING | DIO_SKIP_HOLES); } #endif void inode_dio_wait(struct inode *inode); /* * inode_dio_begin - signal start of a direct I/O requests * @inode: inode the direct I/O happens on * * This is called once we've finished processing a direct I/O request, * and is used to wake up callers waiting for direct I/O to be quiesced. */ static inline void inode_dio_begin(struct inode *inode) { atomic_inc(&inode->i_dio_count); } /* * inode_dio_end - signal finish of a direct I/O requests * @inode: inode the direct I/O happens on * * This is called once we've finished processing a direct I/O request, * and is used to wake up callers waiting for direct I/O to be quiesced. */ static inline void inode_dio_end(struct inode *inode) { if (atomic_dec_and_test(&inode->i_dio_count)) wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); } /* * Warn about a page cache invalidation failure diring a direct I/O write. */ void dio_warn_stale_pagecache(struct file *filp); extern void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask); extern const struct file_operations generic_ro_fops; #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) extern int readlink_copy(char __user *, int, const char *); extern int page_readlink(struct dentry *, char __user *, int); extern const char *page_get_link(struct dentry *, struct inode *, struct delayed_call *); extern void page_put_link(void *); extern int __page_symlink(struct inode *inode, const char *symname, int len, int nofs); extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; extern void kfree_link(void *); extern void generic_fillattr(struct inode *, struct kstat *); extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int); extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int); void __inode_add_bytes(struct inode *inode, loff_t bytes); void inode_add_bytes(struct inode *inode, loff_t bytes); void __inode_sub_bytes(struct inode *inode, loff_t bytes); void inode_sub_bytes(struct inode *inode, loff_t bytes); static inline loff_t __inode_get_bytes(struct inode *inode) { return (((loff_t)inode->i_blocks) << 9) + inode->i_bytes; } loff_t inode_get_bytes(struct inode *inode); void inode_set_bytes(struct inode *inode, loff_t bytes); const char *simple_get_link(struct dentry *, struct inode *, struct delayed_call *); extern const struct inode_operations simple_symlink_inode_operations; extern int iterate_dir(struct file *, struct dir_context *); int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int flags); int vfs_fstat(int fd, struct kstat *stat); static inline int vfs_stat(const char __user *filename, struct kstat *stat) { return vfs_fstatat(AT_FDCWD, filename, stat, 0); } static inline int vfs_lstat(const char __user *name, struct kstat *stat) { return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW); } extern const char *vfs_get_link(struct dentry *, struct delayed_call *); extern int vfs_readlink(struct dentry *, char __user *, int); extern struct file_system_type *get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); extern struct super_block *get_super(struct block_device *); extern struct super_block *get_super_thawed(struct block_device *); extern struct super_block *get_super_exclusive_thawed(struct block_device *bdev); extern struct super_block *get_active_super(struct block_device *bdev); extern void drop_super(struct super_block *sb); extern void drop_super_exclusive(struct super_block *sb); extern void iterate_supers(void (*)(struct super_block *, void *), void *); extern void iterate_supers_type(struct file_system_type *, void (*)(struct super_block *, void *), void *); extern int dcache_dir_open(struct inode *, struct file *); extern int dcache_dir_close(struct inode *, struct file *); extern loff_t dcache_dir_lseek(struct file *, loff_t, int); extern int dcache_readdir(struct file *, struct dir_context *); extern int simple_setattr(struct dentry *, struct iattr *); extern int simple_getattr(const struct path *, struct kstat *, u32, unsigned int); extern int simple_statfs(struct dentry *, struct kstatfs *); extern int simple_open(struct inode *inode, struct file *file); extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); extern void simple_recursive_removal(struct dentry *, void (*callback)(struct dentry *)); extern int noop_fsync(struct file *, loff_t, loff_t, int); extern int noop_set_page_dirty(struct page *page); extern void noop_invalidatepage(struct page *page, unsigned int offset, unsigned int length); extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter); extern int simple_empty(struct dentry *); extern int simple_readpage(struct file *file, struct page *page); extern int simple_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata); extern int simple_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); extern int always_delete_dentry(const struct dentry *); extern struct inode *alloc_anon_inode(struct super_block *); extern int simple_nosetlease(struct file *, long, struct file_lock **, void **); extern const struct dentry_operations simple_dentry_operations; extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags); extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); extern const struct file_operations simple_dir_operations; extern const struct inode_operations simple_dir_inode_operations; extern void make_empty_dir_inode(struct inode *inode); extern bool is_empty_dir_inode(struct inode *inode); struct tree_descr { const char *name; const struct file_operations *ops; int mode; }; struct dentry *d_alloc_name(struct dentry *, const char *); extern int simple_fill_super(struct super_block *, unsigned long, const struct tree_descr *); extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); extern void simple_release_fs(struct vfsmount **mount, int *count); extern ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available); extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count); extern int __generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); #ifdef CONFIG_UNICODE extern int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str); extern int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); #endif #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); extern int buffer_migrate_page_norefs(struct address_space *, struct page *, struct page *, enum migrate_mode); #else #define buffer_migrate_page NULL #define buffer_migrate_page_norefs NULL #endif extern int setattr_prepare(struct dentry *, struct iattr *); extern int inode_newsize_ok(const struct inode *, loff_t offset); extern void setattr_copy(struct inode *inode, const struct iattr *attr); extern int file_update_time(struct file *file); static inline bool vma_is_dax(const struct vm_area_struct *vma) { return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host); } static inline bool vma_is_fsdax(struct vm_area_struct *vma) { struct inode *inode; if (!vma->vm_file) return false; if (!vma_is_dax(vma)) return false; inode = file_inode(vma->vm_file); if (S_ISCHR(inode->i_mode)) return false; /* device-dax */ return true; } static inline int iocb_flags(struct file *file) { int res = 0; if (file->f_flags & O_APPEND) res |= IOCB_APPEND; if (file->f_flags & O_DIRECT) res |= IOCB_DIRECT; if ((file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host)) res |= IOCB_DSYNC; if (file->f_flags & __O_SYNC) res |= IOCB_SYNC; return res; } static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) { int kiocb_flags = 0; /* make sure there's no overlap between RWF and private IOCB flags */ BUILD_BUG_ON((__force int) RWF_SUPPORTED & IOCB_EVENTFD); if (!flags) return 0; if (unlikely(flags & ~RWF_SUPPORTED)) return -EOPNOTSUPP; if (flags & RWF_NOWAIT) { if (!(ki->ki_filp->f_mode & FMODE_NOWAIT)) return -EOPNOTSUPP; kiocb_flags |= IOCB_NOIO; } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; ki->ki_flags |= kiocb_flags; return 0; } static inline ino_t parent_ino(struct dentry *dentry) { ino_t res; /* * Don't strictly need d_lock here? If the parent ino could change * then surely we'd have a deeper race in the caller? */ spin_lock(&dentry->d_lock); res = dentry->d_parent->d_inode->i_ino; spin_unlock(&dentry->d_lock); return res; } /* Transaction based IO helpers */ /* * An argresp is stored in an allocated page and holds the * size of the argument or response, along with its content */ struct simple_transaction_argresp { ssize_t size; char data[]; }; #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp)) char *simple_transaction_get(struct file *file, const char __user *buf, size_t size); ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos); int simple_transaction_release(struct inode *inode, struct file *file); void simple_transaction_set(struct file *file, size_t n); /* * simple attribute files * * These attributes behave similar to those in sysfs: * * Writing to an attribute immediately sets a value, an open file can be * written to multiple times. * * Reading from an attribute creates a buffer from the value that might get * read with multiple read calls. When the attribute has been read * completely, no further read calls are possible until the file is opened * again. * * All attributes contain a text representation of a numeric value * that are accessed with the get() and set() functions. */ #define DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed) \ static int __fops ## _open(struct inode *inode, struct file *file) \ { \ __simple_attr_check_format(__fmt, 0ull); \ return simple_attr_open(inode, file, __get, __set, __fmt); \ } \ static const struct file_operations __fops = { \ .owner = THIS_MODULE, \ .open = __fops ## _open, \ .release = simple_attr_release, \ .read = simple_attr_read, \ .write = (__is_signed) ? simple_attr_write_signed : simple_attr_write, \ .llseek = generic_file_llseek, \ } #define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, false) #define DEFINE_SIMPLE_ATTRIBUTE_SIGNED(__fops, __get, __set, __fmt) \ DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, true) static inline __printf(1, 2) void __simple_attr_check_format(const char *fmt, ...) { /* don't do anything, just let the compiler check the arguments; */ } int simple_attr_open(struct inode *inode, struct file *file, int (*get)(void *, u64 *), int (*set)(void *, u64), const char *fmt); int simple_attr_release(struct inode *inode, struct file *file); ssize_t simple_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos); ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, size_t len, loff_t *ppos); struct ctl_table; int proc_nr_files(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_inodes(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int __init get_filesystem_list(char *buf); #define __FMODE_EXEC ((__force int) FMODE_EXEC) #define __FMODE_NONOTIFY ((__force int) FMODE_NONOTIFY) #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) #define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \ (flag & __FMODE_NONOTIFY))) static inline bool is_sxid(umode_t mode) { return mode & (S_ISUID | S_ISGID); } static inline int check_sticky(struct inode *dir, struct inode *inode) { if (!(dir->i_mode & S_ISVTX)) return 0; return __check_sticky(dir, inode); } static inline void inode_has_no_xattr(struct inode *inode) { if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & SB_NOSEC)) inode->i_flags |= S_NOSEC; } static inline bool is_root_inode(struct inode *inode) { return inode == inode->i_sb->s_root->d_inode; } static inline bool dir_emit(struct dir_context *ctx, const char *name, int namelen, u64 ino, unsigned type) { return ctx->actor(ctx, name, namelen, ctx->pos, ino, type) == 0; } static inline bool dir_emit_dot(struct file *file, struct dir_context *ctx) { return ctx->actor(ctx, ".", 1, ctx->pos, file->f_path.dentry->d_inode->i_ino, DT_DIR) == 0; } static inline bool dir_emit_dotdot(struct file *file, struct dir_context *ctx) { return ctx->actor(ctx, "..", 2, ctx->pos, parent_ino(file->f_path.dentry), DT_DIR) == 0; } static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx) { if (ctx->pos == 0) { if (!dir_emit_dot(file, ctx)) return false; ctx->pos = 1; } if (ctx->pos == 1) { if (!dir_emit_dotdot(file, ctx)) return false; ctx->pos = 2; } return true; } static inline bool dir_relax(struct inode *inode) { inode_unlock(inode); inode_lock(inode); return !IS_DEADDIR(inode); } static inline bool dir_relax_shared(struct inode *inode) { inode_unlock_shared(inode); inode_lock_shared(inode); return !IS_DEADDIR(inode); } extern bool path_noexec(const struct path *path); extern void inode_nohighmem(struct inode *inode); /* mm/fadvise.c */ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice); extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice); int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags, unsigned int flags); int vfs_ioc_fssetxattr_check(struct inode *inode, const struct fsxattr *old_fa, struct fsxattr *fa); static inline void simple_fill_fsxattr(struct fsxattr *fa, __u32 xflags) { memset(fa, 0, sizeof(*fa)); fa->fsx_xflags = xflags; } /* * Flush file data before changing attributes. Caller must hold any locks * required to prevent further writes to this file until we're done setting * flags. */ static inline int inode_drain_writes(struct inode *inode) { inode_dio_wait(inode); return filemap_write_and_wait(inode->i_mapping); } #endif /* _LINUX_FS_H */
1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds */ /* * 'tty_io.c' gives an orthogonal feeling to tty's, be they consoles * or rs-channels. It also implements echoing, cooked mode etc. * * Kill-line thanks to John T Kohl, who also corrected VMIN = VTIME = 0. * * Modified by Theodore Ts'o, 9/14/92, to dynamically allocate the * tty_struct and tty_queue structures. Previously there was an array * of 256 tty_struct's which was statically allocated, and the * tty_queue structures were allocated at boot time. Both are now * dynamically allocated only when the tty is open. * * Also restructured routines so that there is more of a separation * between the high-level tty routines (tty_io.c and tty_ioctl.c) and * the low-level tty routines (serial.c, pty.c, console.c). This * makes for cleaner and more compact code. -TYT, 9/17/92 * * Modified by Fred N. van Kempen, 01/29/93, to add line disciplines * which can be dynamically activated and de-activated by the line * discipline handling modules (like SLIP). * * NOTE: pay no attention to the line discipline code (yet); its * interface is still subject to change in this version... * -- TYT, 1/31/92 * * Added functionality to the OPOST tty handling. No delays, but all * other bits should be there. * -- Nick Holloway <alfie@dcs.warwick.ac.uk>, 27th May 1993. * * Rewrote canonical mode and added more termios flags. * -- julian@uhunix.uhcc.hawaii.edu (J. Cowley), 13Jan94 * * Reorganized FASYNC support so mouse code can share it. * -- ctm@ardi.com, 9Sep95 * * New TIOCLINUX variants added. * -- mj@k332.feld.cvut.cz, 19-Nov-95 * * Restrict vt switching via ioctl() * -- grif@cs.ucr.edu, 5-Dec-95 * * Move console and virtual terminal code to more appropriate files, * implement CONFIG_VT and generalize console device interface. * -- Marko Kohtala <Marko.Kohtala@hut.fi>, March 97 * * Rewrote tty_init_dev and tty_release_dev to eliminate races. * -- Bill Hawes <whawes@star.net>, June 97 * * Added devfs support. * -- C. Scott Ananian <cananian@alumni.princeton.edu>, 13-Jan-1998 * * Added support for a Unix98-style ptmx device. * -- C. Scott Ananian <cananian@alumni.princeton.edu>, 14-Jan-1998 * * Reduced memory usage for older ARM systems * -- Russell King <rmk@arm.linux.org.uk> * * Move do_SAK() into process context. Less stack use in devfs functions. * alloc_tty_struct() always uses kmalloc() * -- Andrew Morton <andrewm@uow.edu.eu> 17Mar01 */ #include <linux/types.h> #include <linux/major.h> #include <linux/errno.h> #include <linux/signal.h> #include <linux/fcntl.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/interrupt.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/devpts_fs.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/console.h> #include <linux/timer.h> #include <linux/ctype.h> #include <linux/kd.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/ppp-ioctl.h> #include <linux/proc_fs.h> #include <linux/init.h> #include <linux/module.h> #include <linux/device.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/delay.h> #include <linux/seq_file.h> #include <linux/serial.h> #include <linux/ratelimit.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <linux/kbd_kern.h> #include <linux/vt_kern.h> #include <linux/selection.h> #include <linux/kmod.h> #include <linux/nsproxy.h> #include "tty.h" #undef TTY_DEBUG_HANGUP #ifdef TTY_DEBUG_HANGUP # define tty_debug_hangup(tty, f, args...) tty_debug(tty, f, ##args) #else # define tty_debug_hangup(tty, f, args...) do { } while (0) #endif #define TTY_PARANOIA_CHECK 1 #define CHECK_TTY_COUNT 1 struct ktermios tty_std_termios = { /* for the benefit of tty drivers */ .c_iflag = ICRNL | IXON, .c_oflag = OPOST | ONLCR, .c_cflag = B38400 | CS8 | CREAD | HUPCL, .c_lflag = ISIG | ICANON | ECHO | ECHOE | ECHOK | ECHOCTL | ECHOKE | IEXTEN, .c_cc = INIT_C_CC, .c_ispeed = 38400, .c_ospeed = 38400, /* .c_line = N_TTY, */ }; EXPORT_SYMBOL(tty_std_termios); /* This list gets poked at by procfs and various bits of boot up code. This could do with some rationalisation such as pulling the tty proc function into this file */ LIST_HEAD(tty_drivers); /* linked list of tty drivers */ /* Mutex to protect creating and releasing a tty */ DEFINE_MUTEX(tty_mutex); static ssize_t tty_read(struct kiocb *, struct iov_iter *); static ssize_t tty_write(struct kiocb *, struct iov_iter *); static __poll_t tty_poll(struct file *, poll_table *); static int tty_open(struct inode *, struct file *); #ifdef CONFIG_COMPAT static long tty_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #else #define tty_compat_ioctl NULL #endif static int __tty_fasync(int fd, struct file *filp, int on); static int tty_fasync(int fd, struct file *filp, int on); static void release_tty(struct tty_struct *tty, int idx); /** * free_tty_struct - free a disused tty * @tty: tty struct to free * * Free the write buffers, tty queue and tty memory itself. * * Locking: none. Must be called after tty is definitely unused */ static void free_tty_struct(struct tty_struct *tty) { tty_ldisc_deinit(tty); put_device(tty->dev); kfree(tty->write_buf); tty->magic = 0xDEADDEAD; kfree(tty); } static inline struct tty_struct *file_tty(struct file *file) { return ((struct tty_file_private *)file->private_data)->tty; } int tty_alloc_file(struct file *file) { struct tty_file_private *priv; priv = kmalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; file->private_data = priv; return 0; } /* Associate a new file with the tty structure */ void tty_add_file(struct tty_struct *tty, struct file *file) { struct tty_file_private *priv = file->private_data; priv->tty = tty; priv->file = file; spin_lock(&tty->files_lock); list_add(&priv->list, &tty->tty_files); spin_unlock(&tty->files_lock); } /** * tty_free_file - free file->private_data * * This shall be used only for fail path handling when tty_add_file was not * called yet. */ void tty_free_file(struct file *file) { struct tty_file_private *priv = file->private_data; file->private_data = NULL; kfree(priv); } /* Delete file from its tty */ static void tty_del_file(struct file *file) { struct tty_file_private *priv = file->private_data; struct tty_struct *tty = priv->tty; spin_lock(&tty->files_lock); list_del(&priv->list); spin_unlock(&tty->files_lock); tty_free_file(file); } /** * tty_name - return tty naming * @tty: tty structure * * Convert a tty structure into a name. The name reflects the kernel * naming policy and if udev is in use may not reflect user space * * Locking: none */ const char *tty_name(const struct tty_struct *tty) { if (!tty) /* Hmm. NULL pointer. That's fun. */ return "NULL tty"; return tty->name; } EXPORT_SYMBOL(tty_name); const char *tty_driver_name(const struct tty_struct *tty) { if (!tty || !tty->driver) return ""; return tty->driver->name; } static int tty_paranoia_check(struct tty_struct *tty, struct inode *inode, const char *routine) { #ifdef TTY_PARANOIA_CHECK if (!tty) { pr_warn("(%d:%d): %s: NULL tty\n", imajor(inode), iminor(inode), routine); return 1; } if (tty->magic != TTY_MAGIC) { pr_warn("(%d:%d): %s: bad magic number\n", imajor(inode), iminor(inode), routine); return 1; } #endif return 0; } /* Caller must hold tty_lock */ static int check_tty_count(struct tty_struct *tty, const char *routine) { #ifdef CHECK_TTY_COUNT struct list_head *p; int count = 0, kopen_count = 0; spin_lock(&tty->files_lock); list_for_each(p, &tty->tty_files) { count++; } spin_unlock(&tty->files_lock); if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_SLAVE && tty->link && tty->link->count) count++; if (tty_port_kopened(tty->port)) kopen_count++; if (tty->count != (count + kopen_count)) { tty_warn(tty, "%s: tty->count(%d) != (#fd's(%d) + #kopen's(%d))\n", routine, tty->count, count, kopen_count); return (count + kopen_count); } #endif return 0; } /** * get_tty_driver - find device of a tty * @device: device identifier * @index: returns the index of the tty * * This routine returns a tty driver structure, given a device number * and also passes back the index number. * * Locking: caller must hold tty_mutex */ static struct tty_driver *get_tty_driver(dev_t device, int *index) { struct tty_driver *p; list_for_each_entry(p, &tty_drivers, tty_drivers) { dev_t base = MKDEV(p->major, p->minor_start); if (device < base || device >= base + p->num) continue; *index = device - base; return tty_driver_kref_get(p); } return NULL; } /** * tty_dev_name_to_number - return dev_t for device name * @name: user space name of device under /dev * @number: pointer to dev_t that this function will populate * * This function converts device names like ttyS0 or ttyUSB1 into dev_t * like (4, 64) or (188, 1). If no corresponding driver is registered then * the function returns -ENODEV. * * Locking: this acquires tty_mutex to protect the tty_drivers list from * being modified while we are traversing it, and makes sure to * release it before exiting. */ int tty_dev_name_to_number(const char *name, dev_t *number) { struct tty_driver *p; int ret; int index, prefix_length = 0; const char *str; for (str = name; *str && !isdigit(*str); str++) ; if (!*str) return -EINVAL; ret = kstrtoint(str, 10, &index); if (ret) return ret; prefix_length = str - name; mutex_lock(&tty_mutex); list_for_each_entry(p, &tty_drivers, tty_drivers) if (prefix_length == strlen(p->name) && strncmp(name, p->name, prefix_length) == 0) { if (index < p->num) { *number = MKDEV(p->major, p->minor_start + index); goto out; } } /* if here then driver wasn't found */ ret = -ENODEV; out: mutex_unlock(&tty_mutex); return ret; } EXPORT_SYMBOL_GPL(tty_dev_name_to_number); #ifdef CONFIG_CONSOLE_POLL /** * tty_find_polling_driver - find device of a polled tty * @name: name string to match * @line: pointer to resulting tty line nr * * This routine returns a tty driver structure, given a name * and the condition that the tty driver is capable of polled * operation. */ struct tty_driver *tty_find_polling_driver(char *name, int *line) { struct tty_driver *p, *res = NULL; int tty_line = 0; int len; char *str, *stp; for (str = name; *str; str++) if ((*str >= '0' && *str <= '9') || *str == ',') break; if (!*str) return NULL; len = str - name; tty_line = simple_strtoul(str, &str, 10); mutex_lock(&tty_mutex); /* Search through the tty devices to look for a match */ list_for_each_entry(p, &tty_drivers, tty_drivers) { if (!len || strncmp(name, p->name, len) != 0) continue; stp = str; if (*stp == ',') stp++; if (*stp == '\0') stp = NULL; if (tty_line >= 0 && tty_line < p->num && p->ops && p->ops->poll_init && !p->ops->poll_init(p, tty_line, stp)) { res = tty_driver_kref_get(p); *line = tty_line; break; } } mutex_unlock(&tty_mutex); return res; } EXPORT_SYMBOL_GPL(tty_find_polling_driver); #endif static ssize_t hung_up_tty_read(struct kiocb *iocb, struct iov_iter *to) { return 0; } static ssize_t hung_up_tty_write(struct kiocb *iocb, struct iov_iter *from) { return -EIO; } /* No kernel lock held - none needed ;) */ static __poll_t hung_up_tty_poll(struct file *filp, poll_table *wait) { return EPOLLIN | EPOLLOUT | EPOLLERR | EPOLLHUP | EPOLLRDNORM | EPOLLWRNORM; } static long hung_up_tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return cmd == TIOCSPGRP ? -ENOTTY : -EIO; } static long hung_up_tty_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return cmd == TIOCSPGRP ? -ENOTTY : -EIO; } static int hung_up_tty_fasync(int fd, struct file *file, int on) { return -ENOTTY; } static void tty_show_fdinfo(struct seq_file *m, struct file *file) { struct tty_struct *tty = file_tty(file); if (tty && tty->ops && tty->ops->show_fdinfo) tty->ops->show_fdinfo(tty, m); } static const struct file_operations tty_fops = { .llseek = no_llseek, .read_iter = tty_read, .write_iter = tty_write, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .poll = tty_poll, .unlocked_ioctl = tty_ioctl, .compat_ioctl = tty_compat_ioctl, .open = tty_open, .release = tty_release, .fasync = tty_fasync, .show_fdinfo = tty_show_fdinfo, }; static const struct file_operations console_fops = { .llseek = no_llseek, .read_iter = tty_read, .write_iter = redirected_tty_write, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .poll = tty_poll, .unlocked_ioctl = tty_ioctl, .compat_ioctl = tty_compat_ioctl, .open = tty_open, .release = tty_release, .fasync = tty_fasync, }; static const struct file_operations hung_up_tty_fops = { .llseek = no_llseek, .read_iter = hung_up_tty_read, .write_iter = hung_up_tty_write, .poll = hung_up_tty_poll, .unlocked_ioctl = hung_up_tty_ioctl, .compat_ioctl = hung_up_tty_compat_ioctl, .release = tty_release, .fasync = hung_up_tty_fasync, }; static DEFINE_SPINLOCK(redirect_lock); static struct file *redirect; extern void tty_sysctl_init(void); /** * tty_wakeup - request more data * @tty: terminal * * Internal and external helper for wakeups of tty. This function * informs the line discipline if present that the driver is ready * to receive more output data. */ void tty_wakeup(struct tty_struct *tty) { struct tty_ldisc *ld; if (test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) { ld = tty_ldisc_ref(tty); if (ld) { if (ld->ops->write_wakeup) ld->ops->write_wakeup(tty); tty_ldisc_deref(ld); } } wake_up_interruptible_poll(&tty->write_wait, EPOLLOUT); } EXPORT_SYMBOL_GPL(tty_wakeup); /** * __tty_hangup - actual handler for hangup events * @tty: tty device * * This can be called by a "kworker" kernel thread. That is process * synchronous but doesn't hold any locks, so we need to make sure we * have the appropriate locks for what we're doing. * * The hangup event clears any pending redirections onto the hung up * device. It ensures future writes will error and it does the needed * line discipline hangup and signal delivery. The tty object itself * remains intact. * * Locking: * BTM * redirect lock for undoing redirection * file list lock for manipulating list of ttys * tty_ldiscs_lock from called functions * termios_rwsem resetting termios data * tasklist_lock to walk task list for hangup event * ->siglock to protect ->signal/->sighand */ static void __tty_hangup(struct tty_struct *tty, int exit_session) { struct file *cons_filp = NULL; struct file *filp, *f = NULL; struct tty_file_private *priv; int closecount = 0, n; int refs; if (!tty) return; spin_lock(&redirect_lock); if (redirect && file_tty(redirect) == tty) { f = redirect; redirect = NULL; } spin_unlock(&redirect_lock); tty_lock(tty); if (test_bit(TTY_HUPPED, &tty->flags)) { tty_unlock(tty); return; } /* * Some console devices aren't actually hung up for technical and * historical reasons, which can lead to indefinite interruptible * sleep in n_tty_read(). The following explicitly tells * n_tty_read() to abort readers. */ set_bit(TTY_HUPPING, &tty->flags); /* inuse_filps is protected by the single tty lock, this really needs to change if we want to flush the workqueue with the lock held */ check_tty_count(tty, "tty_hangup"); spin_lock(&tty->files_lock); /* This breaks for file handles being sent over AF_UNIX sockets ? */ list_for_each_entry(priv, &tty->tty_files, list) { filp = priv->file; if (filp->f_op->write_iter == redirected_tty_write) cons_filp = filp; if (filp->f_op->write_iter != tty_write) continue; closecount++; __tty_fasync(-1, filp, 0); /* can't block */ filp->f_op = &hung_up_tty_fops; } spin_unlock(&tty->files_lock); refs = tty_signal_session_leader(tty, exit_session); /* Account for the p->signal references we killed */ while (refs--) tty_kref_put(tty); tty_ldisc_hangup(tty, cons_filp != NULL); spin_lock_irq(&tty->ctrl_lock); clear_bit(TTY_THROTTLED, &tty->flags); clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); put_pid(tty->session); put_pid(tty->pgrp); tty->session = NULL; tty->pgrp = NULL; tty->ctrl_status = 0; spin_unlock_irq(&tty->ctrl_lock); /* * If one of the devices matches a console pointer, we * cannot just call hangup() because that will cause * tty->count and state->count to go out of sync. * So we just call close() the right number of times. */ if (cons_filp) { if (tty->ops->close) for (n = 0; n < closecount; n++) tty->ops->close(tty, cons_filp); } else if (tty->ops->hangup) tty->ops->hangup(tty); /* * We don't want to have driver/ldisc interactions beyond the ones * we did here. The driver layer expects no calls after ->hangup() * from the ldisc side, which is now guaranteed. */ set_bit(TTY_HUPPED, &tty->flags); clear_bit(TTY_HUPPING, &tty->flags); tty_unlock(tty); if (f) fput(f); } static void do_tty_hangup(struct work_struct *work) { struct tty_struct *tty = container_of(work, struct tty_struct, hangup_work); __tty_hangup(tty, 0); } /** * tty_hangup - trigger a hangup event * @tty: tty to hangup * * A carrier loss (virtual or otherwise) has occurred on this like * schedule a hangup sequence to run after this event. */ void tty_hangup(struct tty_struct *tty) { tty_debug_hangup(tty, "hangup\n"); schedule_work(&tty->hangup_work); } EXPORT_SYMBOL(tty_hangup); /** * tty_vhangup - process vhangup * @tty: tty to hangup * * The user has asked via system call for the terminal to be hung up. * We do this synchronously so that when the syscall returns the process * is complete. That guarantee is necessary for security reasons. */ void tty_vhangup(struct tty_struct *tty) { tty_debug_hangup(tty, "vhangup\n"); __tty_hangup(tty, 0); } EXPORT_SYMBOL(tty_vhangup); /** * tty_vhangup_self - process vhangup for own ctty * * Perform a vhangup on the current controlling tty */ void tty_vhangup_self(void) { struct tty_struct *tty; tty = get_current_tty(); if (tty) { tty_vhangup(tty); tty_kref_put(tty); } } /** * tty_vhangup_session - hangup session leader exit * @tty: tty to hangup * * The session leader is exiting and hanging up its controlling terminal. * Every process in the foreground process group is signalled SIGHUP. * * We do this synchronously so that when the syscall returns the process * is complete. That guarantee is necessary for security reasons. */ void tty_vhangup_session(struct tty_struct *tty) { tty_debug_hangup(tty, "session hangup\n"); __tty_hangup(tty, 1); } /** * tty_hung_up_p - was tty hung up * @filp: file pointer of tty * * Return true if the tty has been subject to a vhangup or a carrier * loss */ int tty_hung_up_p(struct file *filp) { return (filp && filp->f_op == &hung_up_tty_fops); } EXPORT_SYMBOL(tty_hung_up_p); /** * stop_tty - propagate flow control * @tty: tty to stop * * Perform flow control to the driver. May be called * on an already stopped device and will not re-call the driver * method. * * This functionality is used by both the line disciplines for * halting incoming flow and by the driver. It may therefore be * called from any context, may be under the tty atomic_write_lock * but not always. * * Locking: * flow_lock */ void __stop_tty(struct tty_struct *tty) { if (tty->stopped) return; tty->stopped = 1; if (tty->ops->stop) tty->ops->stop(tty); } void stop_tty(struct tty_struct *tty) { unsigned long flags; spin_lock_irqsave(&tty->flow_lock, flags); __stop_tty(tty); spin_unlock_irqrestore(&tty->flow_lock, flags); } EXPORT_SYMBOL(stop_tty); /** * start_tty - propagate flow control * @tty: tty to start * * Start a tty that has been stopped if at all possible. If this * tty was previous stopped and is now being started, the driver * start method is invoked and the line discipline woken. * * Locking: * flow_lock */ void __start_tty(struct tty_struct *tty) { if (!tty->stopped || tty->flow_stopped) return; tty->stopped = 0; if (tty->ops->start) tty->ops->start(tty); tty_wakeup(tty); } void start_tty(struct tty_struct *tty) { unsigned long flags; spin_lock_irqsave(&tty->flow_lock, flags); __start_tty(tty); spin_unlock_irqrestore(&tty->flow_lock, flags); } EXPORT_SYMBOL(start_tty); static void tty_update_time(struct timespec64 *time) { time64_t sec = ktime_get_real_seconds(); /* * We only care if the two values differ in anything other than the * lower three bits (i.e every 8 seconds). If so, then we can update * the time of the tty device, otherwise it could be construded as a * security leak to let userspace know the exact timing of the tty. */ if ((sec ^ time->tv_sec) & ~7) time->tv_sec = sec; } /* * Iterate on the ldisc ->read() function until we've gotten all * the data the ldisc has for us. * * The "cookie" is something that the ldisc read function can fill * in to let us know that there is more data to be had. * * We promise to continue to call the ldisc until it stops returning * data or clears the cookie. The cookie may be something that the * ldisc maintains state for and needs to free. */ static int iterate_tty_read(struct tty_ldisc *ld, struct tty_struct *tty, struct file *file, struct iov_iter *to) { int retval = 0; void *cookie = NULL; unsigned long offset = 0; char kernel_buf[64]; size_t count = iov_iter_count(to); do { int size, copied; size = count > sizeof(kernel_buf) ? sizeof(kernel_buf) : count; size = ld->ops->read(tty, file, kernel_buf, size, &cookie, offset); if (!size) break; if (size < 0) { /* Did we have an earlier error (ie -EFAULT)? */ if (retval) break; retval = size; /* * -EOVERFLOW means we didn't have enough space * for a whole packet, and we shouldn't return * a partial result. */ if (retval == -EOVERFLOW) offset = 0; break; } copied = copy_to_iter(kernel_buf, size, to); offset += copied; count -= copied; /* * If the user copy failed, we still need to do another ->read() * call if we had a cookie to let the ldisc clear up. * * But make sure size is zeroed. */ if (unlikely(copied != size)) { count = 0; retval = -EFAULT; } } while (cookie); /* We always clear tty buffer in case they contained passwords */ memzero_explicit(kernel_buf, sizeof(kernel_buf)); return offset ? offset : retval; } /** * tty_read - read method for tty device files * @file: pointer to tty file * @buf: user buffer * @count: size of user buffer * @ppos: unused * * Perform the read system call function on this terminal device. Checks * for hung up devices before calling the line discipline method. * * Locking: * Locks the line discipline internally while needed. Multiple * read calls may be outstanding in parallel. */ static ssize_t tty_read(struct kiocb *iocb, struct iov_iter *to) { int i; struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct tty_struct *tty = file_tty(file); struct tty_ldisc *ld; if (tty_paranoia_check(tty, inode, "tty_read")) return -EIO; if (!tty || tty_io_error(tty)) return -EIO; /* We want to wait for the line discipline to sort out in this situation */ ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_read(iocb, to); i = -EIO; if (ld->ops->read) i = iterate_tty_read(ld, tty, file, to); tty_ldisc_deref(ld); if (i > 0) tty_update_time(&inode->i_atime); return i; } void tty_write_unlock(struct tty_struct *tty) { mutex_unlock(&tty->atomic_write_lock); wake_up_interruptible_poll(&tty->write_wait, EPOLLOUT); } int tty_write_lock(struct tty_struct *tty, bool ndelay) { if (!mutex_trylock(&tty->atomic_write_lock)) { if (ndelay) return -EAGAIN; if (mutex_lock_interruptible(&tty->atomic_write_lock)) return -ERESTARTSYS; } return 0; } /* * Split writes up in sane blocksizes to avoid * denial-of-service type attacks */ static inline ssize_t do_tty_write( ssize_t (*write)(struct tty_struct *, struct file *, const unsigned char *, size_t), struct tty_struct *tty, struct file *file, struct iov_iter *from) { size_t count = iov_iter_count(from); ssize_t ret, written = 0; unsigned int chunk; ret = tty_write_lock(tty, file->f_flags & O_NDELAY); if (ret < 0) return ret; /* * We chunk up writes into a temporary buffer. This * simplifies low-level drivers immensely, since they * don't have locking issues and user mode accesses. * * But if TTY_NO_WRITE_SPLIT is set, we should use a * big chunk-size.. * * The default chunk-size is 2kB, because the NTTY * layer has problems with bigger chunks. It will * claim to be able to handle more characters than * it actually does. * * FIXME: This can probably go away now except that 64K chunks * are too likely to fail unless switched to vmalloc... */ chunk = 2048; if (test_bit(TTY_NO_WRITE_SPLIT, &tty->flags)) chunk = 65536; if (count < chunk) chunk = count; /* write_buf/write_cnt is protected by the atomic_write_lock mutex */ if (tty->write_cnt < chunk) { unsigned char *buf_chunk; if (chunk < 1024) chunk = 1024; buf_chunk = kmalloc(chunk, GFP_KERNEL); if (!buf_chunk) { ret = -ENOMEM; goto out; } kfree(tty->write_buf); tty->write_cnt = chunk; tty->write_buf = buf_chunk; } /* Do the write .. */ for (;;) { size_t size = count; if (size > chunk) size = chunk; ret = -EFAULT; if (copy_from_iter(tty->write_buf, size, from) != size) break; ret = write(tty, file, tty->write_buf, size); if (ret <= 0) break; written += ret; if (ret > size) break; /* FIXME! Have Al check this! */ if (ret != size) iov_iter_revert(from, size-ret); count -= ret; if (!count) break; ret = -ERESTARTSYS; if (signal_pending(current)) break; cond_resched(); } if (written) { tty_update_time(&file_inode(file)->i_mtime); ret = written; } out: tty_write_unlock(tty); return ret; } /** * tty_write_message - write a message to a certain tty, not just the console. * @tty: the destination tty_struct * @msg: the message to write * * This is used for messages that need to be redirected to a specific tty. * We don't put it into the syslog queue right now maybe in the future if * really needed. * * We must still hold the BTM and test the CLOSING flag for the moment. */ void tty_write_message(struct tty_struct *tty, char *msg) { if (tty) { mutex_lock(&tty->atomic_write_lock); tty_lock(tty); if (tty->ops->write && tty->count > 0) tty->ops->write(tty, msg, strlen(msg)); tty_unlock(tty); tty_write_unlock(tty); } return; } /** * tty_write - write method for tty device file * @file: tty file pointer * @buf: user data to write * @count: bytes to write * @ppos: unused * * Write data to a tty device via the line discipline. * * Locking: * Locks the line discipline as required * Writes to the tty driver are serialized by the atomic_write_lock * and are then processed in chunks to the device. The line discipline * write method will not be invoked in parallel for each device. */ static ssize_t file_tty_write(struct file *file, struct kiocb *iocb, struct iov_iter *from) { struct tty_struct *tty = file_tty(file); struct tty_ldisc *ld; ssize_t ret; if (tty_paranoia_check(tty, file_inode(file), "tty_write")) return -EIO; if (!tty || !tty->ops->write || tty_io_error(tty)) return -EIO; /* Short term debug to catch buggy drivers */ if (tty->ops->write_room == NULL) tty_err(tty, "missing write_room method\n"); ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_write(iocb, from); if (!ld->ops->write) ret = -EIO; else ret = do_tty_write(ld->ops->write, tty, file, from); tty_ldisc_deref(ld); return ret; } static ssize_t tty_write(struct kiocb *iocb, struct iov_iter *from) { return file_tty_write(iocb->ki_filp, iocb, from); } ssize_t redirected_tty_write(struct kiocb *iocb, struct iov_iter *iter) { struct file *p = NULL; spin_lock(&redirect_lock); if (redirect) p = get_file(redirect); spin_unlock(&redirect_lock); /* * We know the redirected tty is just another tty, we can can * call file_tty_write() directly with that file pointer. */ if (p) { ssize_t res; res = file_tty_write(p, iocb, iter); fput(p); return res; } return tty_write(iocb, iter); } /** * tty_send_xchar - send priority character * * Send a high priority character to the tty even if stopped * * Locking: none for xchar method, write ordering for write method. */ int tty_send_xchar(struct tty_struct *tty, char ch) { int was_stopped = tty->stopped; if (tty->ops->send_xchar) { down_read(&tty->termios_rwsem); tty->ops->send_xchar(tty, ch); up_read(&tty->termios_rwsem); return 0; } if (tty_write_lock(tty, false) < 0) return -ERESTARTSYS; down_read(&tty->termios_rwsem); if (was_stopped) start_tty(tty); tty->ops->write(tty, &ch, 1); if (was_stopped) stop_tty(tty); up_read(&tty->termios_rwsem); tty_write_unlock(tty); return 0; } static char ptychar[] = "pqrstuvwxyzabcde"; /** * pty_line_name - generate name for a pty * @driver: the tty driver in use * @index: the minor number * @p: output buffer of at least 6 bytes * * Generate a name from a driver reference and write it to the output * buffer. * * Locking: None */ static void pty_line_name(struct tty_driver *driver, int index, char *p) { int i = index + driver->name_base; /* ->name is initialized to "ttyp", but "tty" is expected */ sprintf(p, "%s%c%x", driver->subtype == PTY_TYPE_SLAVE ? "tty" : driver->name, ptychar[i >> 4 & 0xf], i & 0xf); } /** * tty_line_name - generate name for a tty * @driver: the tty driver in use * @index: the minor number * @p: output buffer of at least 7 bytes * * Generate a name from a driver reference and write it to the output * buffer. * * Locking: None */ static ssize_t tty_line_name(struct tty_driver *driver, int index, char *p) { if (driver->flags & TTY_DRIVER_UNNUMBERED_NODE) return sprintf(p, "%s", driver->name); else return sprintf(p, "%s%d", driver->name, index + driver->name_base); } /** * tty_driver_lookup_tty() - find an existing tty, if any * @driver: the driver for the tty * @idx: the minor number * * Return the tty, if found. If not found, return NULL or ERR_PTR() if the * driver lookup() method returns an error. * * Locking: tty_mutex must be held. If the tty is found, bump the tty kref. */ static struct tty_struct *tty_driver_lookup_tty(struct tty_driver *driver, struct file *file, int idx) { struct tty_struct *tty; if (driver->ops->lookup) { if (!file) tty = ERR_PTR(-EIO); else tty = driver->ops->lookup(driver, file, idx); } else { if (idx >= driver->num) return ERR_PTR(-EINVAL); tty = driver->ttys[idx]; } if (!IS_ERR(tty)) tty_kref_get(tty); return tty; } /** * tty_init_termios - helper for termios setup * @tty: the tty to set up * * Initialise the termios structure for this tty. This runs under * the tty_mutex currently so we can be relaxed about ordering. */ void tty_init_termios(struct tty_struct *tty) { struct ktermios *tp; int idx = tty->index; if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS) tty->termios = tty->driver->init_termios; else { /* Check for lazy saved data */ tp = tty->driver->termios[idx]; if (tp != NULL) { tty->termios = *tp; tty->termios.c_line = tty->driver->init_termios.c_line; } else tty->termios = tty->driver->init_termios; } /* Compatibility until drivers always set this */ tty->termios.c_ispeed = tty_termios_input_baud_rate(&tty->termios); tty->termios.c_ospeed = tty_termios_baud_rate(&tty->termios); } EXPORT_SYMBOL_GPL(tty_init_termios); int tty_standard_install(struct tty_driver *driver, struct tty_struct *tty) { tty_init_termios(tty); tty_driver_kref_get(driver); tty->count++; driver->ttys[tty->index] = tty; return 0; } EXPORT_SYMBOL_GPL(tty_standard_install); /** * tty_driver_install_tty() - install a tty entry in the driver * @driver: the driver for the tty * @tty: the tty * * Install a tty object into the driver tables. The tty->index field * will be set by the time this is called. This method is responsible * for ensuring any need additional structures are allocated and * configured. * * Locking: tty_mutex for now */ static int tty_driver_install_tty(struct tty_driver *driver, struct tty_struct *tty) { return driver->ops->install ? driver->ops->install(driver, tty) : tty_standard_install(driver, tty); } /** * tty_driver_remove_tty() - remove a tty from the driver tables * @driver: the driver for the tty * @tty: tty to remove * * Remvoe a tty object from the driver tables. The tty->index field * will be set by the time this is called. * * Locking: tty_mutex for now */ static void tty_driver_remove_tty(struct tty_driver *driver, struct tty_struct *tty) { if (driver->ops->remove) driver->ops->remove(driver, tty); else driver->ttys[tty->index] = NULL; } /** * tty_reopen() - fast re-open of an open tty * @tty: the tty to open * * Return 0 on success, -errno on error. * Re-opens on master ptys are not allowed and return -EIO. * * Locking: Caller must hold tty_lock */ static int tty_reopen(struct tty_struct *tty) { struct tty_driver *driver = tty->driver; struct tty_ldisc *ld; int retval = 0; if (driver->type == TTY_DRIVER_TYPE_PTY && driver->subtype == PTY_TYPE_MASTER) return -EIO; if (!tty->count) return -EAGAIN; if (test_bit(TTY_EXCLUSIVE, &tty->flags) && !capable(CAP_SYS_ADMIN)) return -EBUSY; ld = tty_ldisc_ref_wait(tty); if (ld) { tty_ldisc_deref(ld); } else { retval = tty_ldisc_lock(tty, 5 * HZ); if (retval) return retval; if (!tty->ldisc) retval = tty_ldisc_reinit(tty, tty->termios.c_line); tty_ldisc_unlock(tty); } if (retval == 0) tty->count++; return retval; } /** * tty_init_dev - initialise a tty device * @driver: tty driver we are opening a device on * @idx: device index * * Prepare a tty device. This may not be a "new" clean device but * could also be an active device. The pty drivers require special * handling because of this. * * Locking: * The function is called under the tty_mutex, which * protects us from the tty struct or driver itself going away. * * On exit the tty device has the line discipline attached and * a reference count of 1. If a pair was created for pty/tty use * and the other was a pty master then it too has a reference count of 1. * * WSH 06/09/97: Rewritten to remove races and properly clean up after a * failed open. The new code protects the open with a mutex, so it's * really quite straightforward. The mutex locking can probably be * relaxed for the (most common) case of reopening a tty. * * Return: returned tty structure */ struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx) { struct tty_struct *tty; int retval; /* * First time open is complex, especially for PTY devices. * This code guarantees that either everything succeeds and the * TTY is ready for operation, or else the table slots are vacated * and the allocated memory released. (Except that the termios * may be retained.) */ if (!try_module_get(driver->owner)) return ERR_PTR(-ENODEV); tty = alloc_tty_struct(driver, idx); if (!tty) { retval = -ENOMEM; goto err_module_put; } tty_lock(tty); retval = tty_driver_install_tty(driver, tty); if (retval < 0) goto err_free_tty; if (!tty->port) tty->port = driver->ports[idx]; if (WARN_RATELIMIT(!tty->port, "%s: %s driver does not set tty->port. This would crash the kernel. Fix the driver!\n", __func__, tty->driver->name)) { retval = -EINVAL; goto err_release_lock; } retval = tty_ldisc_lock(tty, 5 * HZ); if (retval) goto err_release_lock; tty->port->itty = tty; /* * Structures all installed ... call the ldisc open routines. * If we fail here just call release_tty to clean up. No need * to decrement the use counts, as release_tty doesn't care. */ retval = tty_ldisc_setup(tty, tty->link); if (retval) goto err_release_tty; tty_ldisc_unlock(tty); /* Return the tty locked so that it cannot vanish under the caller */ return tty; err_free_tty: tty_unlock(tty); free_tty_struct(tty); err_module_put: module_put(driver->owner); return ERR_PTR(retval); /* call the tty release_tty routine to clean out this slot */ err_release_tty: tty_ldisc_unlock(tty); tty_info_ratelimited(tty, "ldisc open failed (%d), clearing slot %d\n", retval, idx); err_release_lock: tty_unlock(tty); release_tty(tty, idx); return ERR_PTR(retval); } /** * tty_save_termios() - save tty termios data in driver table * @tty: tty whose termios data to save * * Locking: Caller guarantees serialisation with tty_init_termios(). */ void tty_save_termios(struct tty_struct *tty) { struct ktermios *tp; int idx = tty->index; /* If the port is going to reset then it has no termios to save */ if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS) return; /* Stash the termios data */ tp = tty->driver->termios[idx]; if (tp == NULL) { tp = kmalloc(sizeof(*tp), GFP_KERNEL); if (tp == NULL) return; tty->driver->termios[idx] = tp; } *tp = tty->termios; } EXPORT_SYMBOL_GPL(tty_save_termios); /** * tty_flush_works - flush all works of a tty/pty pair * @tty: tty device to flush works for (or either end of a pty pair) * * Sync flush all works belonging to @tty (and the 'other' tty). */ static void tty_flush_works(struct tty_struct *tty) { flush_work(&tty->SAK_work); flush_work(&tty->hangup_work); if (tty->link) { flush_work(&tty->link->SAK_work); flush_work(&tty->link->hangup_work); } } /** * release_one_tty - release tty structure memory * @work: work of tty we are obliterating * * Releases memory associated with a tty structure, and clears out the * driver table slots. This function is called when a device is no longer * in use. It also gets called when setup of a device fails. * * Locking: * takes the file list lock internally when working on the list * of ttys that the driver keeps. * * This method gets called from a work queue so that the driver private * cleanup ops can sleep (needed for USB at least) */ static void release_one_tty(struct work_struct *work) { struct tty_struct *tty = container_of(work, struct tty_struct, hangup_work); struct tty_driver *driver = tty->driver; struct module *owner = driver->owner; if (tty->ops->cleanup) tty->ops->cleanup(tty); tty->magic = 0; tty_driver_kref_put(driver); module_put(owner); spin_lock(&tty->files_lock); list_del_init(&tty->tty_files); spin_unlock(&tty->files_lock); put_pid(tty->pgrp); put_pid(tty->session); free_tty_struct(tty); } static void queue_release_one_tty(struct kref *kref) { struct tty_struct *tty = container_of(kref, struct tty_struct, kref); /* The hangup queue is now free so we can reuse it rather than waste a chunk of memory for each port */ INIT_WORK(&tty->hangup_work, release_one_tty); schedule_work(&tty->hangup_work); } /** * tty_kref_put - release a tty kref * @tty: tty device * * Release a reference to a tty device and if need be let the kref * layer destruct the object for us */ void tty_kref_put(struct tty_struct *tty) { if (tty) kref_put(&tty->kref, queue_release_one_tty); } EXPORT_SYMBOL(tty_kref_put); /** * release_tty - release tty structure memory * * Release both @tty and a possible linked partner (think pty pair), * and decrement the refcount of the backing module. * * Locking: * tty_mutex * takes the file list lock internally when working on the list * of ttys that the driver keeps. * */ static void release_tty(struct tty_struct *tty, int idx) { /* This should always be true but check for the moment */ WARN_ON(tty->index != idx); WARN_ON(!mutex_is_locked(&tty_mutex)); if (tty->ops->shutdown) tty->ops->shutdown(tty); tty_save_termios(tty); tty_driver_remove_tty(tty->driver, tty); if (tty->port) tty->port->itty = NULL; if (tty->link) tty->link->port->itty = NULL; if (tty->port) tty_buffer_cancel_work(tty->port); if (tty->link) tty_buffer_cancel_work(tty->link->port); tty_kref_put(tty->link); tty_kref_put(tty); } /** * tty_release_checks - check a tty before real release * @tty: tty to check * @idx: index of the tty * * Performs some paranoid checking before true release of the @tty. * This is a no-op unless TTY_PARANOIA_CHECK is defined. */ static int tty_release_checks(struct tty_struct *tty, int idx) { #ifdef TTY_PARANOIA_CHECK if (idx < 0 || idx >= tty->driver->num) { tty_debug(tty, "bad idx %d\n", idx); return -1; } /* not much to check for devpts */ if (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) return 0; if (tty != tty->driver->ttys[idx]) { tty_debug(tty, "bad driver table[%d] = %p\n", idx, tty->driver->ttys[idx]); return -1; } if (tty->driver->other) { struct tty_struct *o_tty = tty->link; if (o_tty != tty->driver->other->ttys[idx]) { tty_debug(tty, "bad other table[%d] = %p\n", idx, tty->driver->other->ttys[idx]); return -1; } if (o_tty->link != tty) { tty_debug(tty, "bad link = %p\n", o_tty->link); return -1; } } #endif return 0; } /** * tty_kclose - closes tty opened by tty_kopen * @tty: tty device * * Performs the final steps to release and free a tty device. It is the * same as tty_release_struct except that it also resets TTY_PORT_KOPENED * flag on tty->port. */ void tty_kclose(struct tty_struct *tty) { /* * Ask the line discipline code to release its structures */ tty_ldisc_release(tty); /* Wait for pending work before tty destruction commmences */ tty_flush_works(tty); tty_debug_hangup(tty, "freeing structure\n"); /* * The release_tty function takes care of the details of clearing * the slots and preserving the termios structure. */ mutex_lock(&tty_mutex); tty_port_set_kopened(tty->port, 0); release_tty(tty, tty->index); mutex_unlock(&tty_mutex); } EXPORT_SYMBOL_GPL(tty_kclose); /** * tty_release_struct - release a tty struct * @tty: tty device * @idx: index of the tty * * Performs the final steps to release and free a tty device. It is * roughly the reverse of tty_init_dev. */ void tty_release_struct(struct tty_struct *tty, int idx) { /* * Ask the line discipline code to release its structures */ tty_ldisc_release(tty); /* Wait for pending work before tty destruction commmences */ tty_flush_works(tty); tty_debug_hangup(tty, "freeing structure\n"); /* * The release_tty function takes care of the details of clearing * the slots and preserving the termios structure. */ mutex_lock(&tty_mutex); release_tty(tty, idx); mutex_unlock(&tty_mutex); } EXPORT_SYMBOL_GPL(tty_release_struct); /** * tty_release - vfs callback for close * @inode: inode of tty * @filp: file pointer for handle to tty * * Called the last time each file handle is closed that references * this tty. There may however be several such references. * * Locking: * Takes bkl. See tty_release_dev * * Even releasing the tty structures is a tricky business.. We have * to be very careful that the structures are all released at the * same time, as interrupts might otherwise get the wrong pointers. * * WSH 09/09/97: rewritten to avoid some nasty race conditions that could * lead to double frees or releasing memory still in use. */ int tty_release(struct inode *inode, struct file *filp) { struct tty_struct *tty = file_tty(filp); struct tty_struct *o_tty = NULL; int do_sleep, final; int idx; long timeout = 0; int once = 1; if (tty_paranoia_check(tty, inode, __func__)) return 0; tty_lock(tty); check_tty_count(tty, __func__); __tty_fasync(-1, filp, 0); idx = tty->index; if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER) o_tty = tty->link; if (tty_release_checks(tty, idx)) { tty_unlock(tty); return 0; } tty_debug_hangup(tty, "releasing (count=%d)\n", tty->count); if (tty->ops->close) tty->ops->close(tty, filp); /* If tty is pty master, lock the slave pty (stable lock order) */ tty_lock_slave(o_tty); /* * Sanity check: if tty->count is going to zero, there shouldn't be * any waiters on tty->read_wait or tty->write_wait. We test the * wait queues and kick everyone out _before_ actually starting to * close. This ensures that we won't block while releasing the tty * structure. * * The test for the o_tty closing is necessary, since the master and * slave sides may close in any order. If the slave side closes out * first, its count will be one, since the master side holds an open. * Thus this test wouldn't be triggered at the time the slave closed, * so we do it now. */ while (1) { do_sleep = 0; if (tty->count <= 1) { if (waitqueue_active(&tty->read_wait)) { wake_up_poll(&tty->read_wait, EPOLLIN); do_sleep++; } if (waitqueue_active(&tty->write_wait)) { wake_up_poll(&tty->write_wait, EPOLLOUT); do_sleep++; } } if (o_tty && o_tty->count <= 1) { if (waitqueue_active(&o_tty->read_wait)) { wake_up_poll(&o_tty->read_wait, EPOLLIN); do_sleep++; } if (waitqueue_active(&o_tty->write_wait)) { wake_up_poll(&o_tty->write_wait, EPOLLOUT); do_sleep++; } } if (!do_sleep) break; if (once) { once = 0; tty_warn(tty, "read/write wait queue active!\n"); } schedule_timeout_killable(timeout); if (timeout < 120 * HZ) timeout = 2 * timeout + 1; else timeout = MAX_SCHEDULE_TIMEOUT; } if (o_tty) { if (--o_tty->count < 0) { tty_warn(tty, "bad slave count (%d)\n", o_tty->count); o_tty->count = 0; } } if (--tty->count < 0) { tty_warn(tty, "bad tty->count (%d)\n", tty->count); tty->count = 0; } /* * We've decremented tty->count, so we need to remove this file * descriptor off the tty->tty_files list; this serves two * purposes: * - check_tty_count sees the correct number of file descriptors * associated with this tty. * - do_tty_hangup no longer sees this file descriptor as * something that needs to be handled for hangups. */ tty_del_file(filp); /* * Perform some housekeeping before deciding whether to return. * * If _either_ side is closing, make sure there aren't any * processes that still think tty or o_tty is their controlling * tty. */ if (!tty->count) { read_lock(&tasklist_lock); session_clear_tty(tty->session); if (o_tty) session_clear_tty(o_tty->session); read_unlock(&tasklist_lock); } /* check whether both sides are closing ... */ final = !tty->count && !(o_tty && o_tty->count); tty_unlock_slave(o_tty); tty_unlock(tty); /* At this point, the tty->count == 0 should ensure a dead tty cannot be re-opened by a racing opener */ if (!final) return 0; tty_debug_hangup(tty, "final close\n"); tty_release_struct(tty, idx); return 0; } /** * tty_open_current_tty - get locked tty of current task * @device: device number * @filp: file pointer to tty * @return: locked tty of the current task iff @device is /dev/tty * * Performs a re-open of the current task's controlling tty. * * We cannot return driver and index like for the other nodes because * devpts will not work then. It expects inodes to be from devpts FS. */ static struct tty_struct *tty_open_current_tty(dev_t device, struct file *filp) { struct tty_struct *tty; int retval; if (device != MKDEV(TTYAUX_MAJOR, 0)) return NULL; tty = get_current_tty(); if (!tty) return ERR_PTR(-ENXIO); filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */ /* noctty = 1; */ tty_lock(tty); tty_kref_put(tty); /* safe to drop the kref now */ retval = tty_reopen(tty); if (retval < 0) { tty_unlock(tty); tty = ERR_PTR(retval); } return tty; } /** * tty_lookup_driver - lookup a tty driver for a given device file * @device: device number * @filp: file pointer to tty * @index: index for the device in the @return driver * @return: driver for this inode (with increased refcount) * * If @return is not erroneous, the caller is responsible to decrement the * refcount by tty_driver_kref_put. * * Locking: tty_mutex protects get_tty_driver */ static struct tty_driver *tty_lookup_driver(dev_t device, struct file *filp, int *index) { struct tty_driver *driver = NULL; switch (device) { #ifdef CONFIG_VT case MKDEV(TTY_MAJOR, 0): { extern struct tty_driver *console_driver; driver = tty_driver_kref_get(console_driver); *index = fg_console; break; } #endif case MKDEV(TTYAUX_MAJOR, 1): { struct tty_driver *console_driver = console_device(index); if (console_driver) { driver = tty_driver_kref_get(console_driver); if (driver && filp) { /* Don't let /dev/console block */ filp->f_flags |= O_NONBLOCK; break; } } if (driver) tty_driver_kref_put(driver); return ERR_PTR(-ENODEV); } default: driver = get_tty_driver(device, index); if (!driver) return ERR_PTR(-ENODEV); break; } return driver; } /** * tty_kopen - open a tty device for kernel * @device: dev_t of device to open * * Opens tty exclusively for kernel. Performs the driver lookup, * makes sure it's not already opened and performs the first-time * tty initialization. * * Returns the locked initialized &tty_struct * * Claims the global tty_mutex to serialize: * - concurrent first-time tty initialization * - concurrent tty driver removal w/ lookup * - concurrent tty removal from driver table */ struct tty_struct *tty_kopen(dev_t device) { struct tty_struct *tty; struct tty_driver *driver; int index = -1; mutex_lock(&tty_mutex); driver = tty_lookup_driver(device, NULL, &index); if (IS_ERR(driver)) { mutex_unlock(&tty_mutex); return ERR_CAST(driver); } /* check whether we're reopening an existing tty */ tty = tty_driver_lookup_tty(driver, NULL, index); if (IS_ERR(tty)) goto out; if (tty) { /* drop kref from tty_driver_lookup_tty() */ tty_kref_put(tty); tty = ERR_PTR(-EBUSY); } else { /* tty_init_dev returns tty with the tty_lock held */ tty = tty_init_dev(driver, index); if (IS_ERR(tty)) goto out; tty_port_set_kopened(tty->port, 1); } out: mutex_unlock(&tty_mutex); tty_driver_kref_put(driver); return tty; } EXPORT_SYMBOL_GPL(tty_kopen); /** * tty_open_by_driver - open a tty device * @device: dev_t of device to open * @filp: file pointer to tty * * Performs the driver lookup, checks for a reopen, or otherwise * performs the first-time tty initialization. * * Returns the locked initialized or re-opened &tty_struct * * Claims the global tty_mutex to serialize: * - concurrent first-time tty initialization * - concurrent tty driver removal w/ lookup * - concurrent tty removal from driver table */ static struct tty_struct *tty_open_by_driver(dev_t device, struct file *filp) { struct tty_struct *tty; struct tty_driver *driver = NULL; int index = -1; int retval; mutex_lock(&tty_mutex); driver = tty_lookup_driver(device, filp, &index); if (IS_ERR(driver)) { mutex_unlock(&tty_mutex); return ERR_CAST(driver); } /* check whether we're reopening an existing tty */ tty = tty_driver_lookup_tty(driver, filp, index); if (IS_ERR(tty)) { mutex_unlock(&tty_mutex); goto out; } if (tty) { if (tty_port_kopened(tty->port)) { tty_kref_put(tty); mutex_unlock(&tty_mutex); tty = ERR_PTR(-EBUSY); goto out; } mutex_unlock(&tty_mutex); retval = tty_lock_interruptible(tty); tty_kref_put(tty); /* drop kref from tty_driver_lookup_tty() */ if (retval) { if (retval == -EINTR) retval = -ERESTARTSYS; tty = ERR_PTR(retval); goto out; } retval = tty_reopen(tty); if (retval < 0) { tty_unlock(tty); tty = ERR_PTR(retval); } } else { /* Returns with the tty_lock held for now */ tty = tty_init_dev(driver, index); mutex_unlock(&tty_mutex); } out: tty_driver_kref_put(driver); return tty; } /** * tty_open - open a tty device * @inode: inode of device file * @filp: file pointer to tty * * tty_open and tty_release keep up the tty count that contains the * number of opens done on a tty. We cannot use the inode-count, as * different inodes might point to the same tty. * * Open-counting is needed for pty masters, as well as for keeping * track of serial lines: DTR is dropped when the last close happens. * (This is not done solely through tty->count, now. - Ted 1/27/92) * * The termios state of a pty is reset on first open so that * settings don't persist across reuse. * * Locking: tty_mutex protects tty, tty_lookup_driver and tty_init_dev. * tty->count should protect the rest. * ->siglock protects ->signal/->sighand * * Note: the tty_unlock/lock cases without a ref are only safe due to * tty_mutex */ static int tty_open(struct inode *inode, struct file *filp) { struct tty_struct *tty; int noctty, retval; dev_t device = inode->i_rdev; unsigned saved_flags = filp->f_flags; nonseekable_open(inode, filp); retry_open: retval = tty_alloc_file(filp); if (retval) return -ENOMEM; tty = tty_open_current_tty(device, filp); if (!tty) tty = tty_open_by_driver(device, filp); if (IS_ERR(tty)) { tty_free_file(filp); retval = PTR_ERR(tty); if (retval != -EAGAIN || signal_pending(current)) return retval; schedule(); goto retry_open; } tty_add_file(tty, filp); check_tty_count(tty, __func__); tty_debug_hangup(tty, "opening (count=%d)\n", tty->count); if (tty->ops->open) retval = tty->ops->open(tty, filp); else retval = -ENODEV; filp->f_flags = saved_flags; if (retval) { tty_debug_hangup(tty, "open error %d, releasing\n", retval); tty_unlock(tty); /* need to call tty_release without BTM */ tty_release(inode, filp); if (retval != -ERESTARTSYS) return retval; if (signal_pending(current)) return retval; schedule(); /* * Need to reset f_op in case a hangup happened. */ if (tty_hung_up_p(filp)) filp->f_op = &tty_fops; goto retry_open; } clear_bit(TTY_HUPPED, &tty->flags); noctty = (filp->f_flags & O_NOCTTY) || (IS_ENABLED(CONFIG_VT) && device == MKDEV(TTY_MAJOR, 0)) || device == MKDEV(TTYAUX_MAJOR, 1) || (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER); if (!noctty) tty_open_proc_set_tty(filp, tty); tty_unlock(tty); return 0; } /** * tty_poll - check tty status * @filp: file being polled * @wait: poll wait structures to update * * Call the line discipline polling method to obtain the poll * status of the device. * * Locking: locks called line discipline but ldisc poll method * may be re-entered freely by other callers. */ static __poll_t tty_poll(struct file *filp, poll_table *wait) { struct tty_struct *tty = file_tty(filp); struct tty_ldisc *ld; __poll_t ret = 0; if (tty_paranoia_check(tty, file_inode(filp), "tty_poll")) return 0; ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_poll(filp, wait); if (ld->ops->poll) ret = ld->ops->poll(tty, filp, wait); tty_ldisc_deref(ld); return ret; } static int __tty_fasync(int fd, struct file *filp, int on) { struct tty_struct *tty = file_tty(filp); unsigned long flags; int retval = 0; if (tty_paranoia_check(tty, file_inode(filp), "tty_fasync")) goto out; retval = fasync_helper(fd, filp, on, &tty->fasync); if (retval <= 0) goto out; if (on) { enum pid_type type; struct pid *pid; spin_lock_irqsave(&tty->ctrl_lock, flags); if (tty->pgrp) { pid = tty->pgrp; type = PIDTYPE_PGID; } else { pid = task_pid(current); type = PIDTYPE_TGID; } get_pid(pid); spin_unlock_irqrestore(&tty->ctrl_lock, flags); __f_setown(filp, pid, type, 0); put_pid(pid); retval = 0; } out: return retval; } static int tty_fasync(int fd, struct file *filp, int on) { struct tty_struct *tty = file_tty(filp); int retval = -ENOTTY; tty_lock(tty); if (!tty_hung_up_p(filp)) retval = __tty_fasync(fd, filp, on); tty_unlock(tty); return retval; } /** * tiocsti - fake input character * @tty: tty to fake input into * @p: pointer to character * * Fake input to a tty device. Does the necessary locking and * input management. * * FIXME: does not honour flow control ?? * * Locking: * Called functions take tty_ldiscs_lock * current->signal->tty check is safe without locks */ static int tiocsti(struct tty_struct *tty, char __user *p) { char ch, mbz = 0; struct tty_ldisc *ld; if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(ch, p)) return -EFAULT; tty_audit_tiocsti(tty, ch); ld = tty_ldisc_ref_wait(tty); if (!ld) return -EIO; tty_buffer_lock_exclusive(tty->port); if (ld->ops->receive_buf) ld->ops->receive_buf(tty, &ch, &mbz, 1); tty_buffer_unlock_exclusive(tty->port); tty_ldisc_deref(ld); return 0; } /** * tiocgwinsz - implement window query ioctl * @tty: tty * @arg: user buffer for result * * Copies the kernel idea of the window size into the user buffer. * * Locking: tty->winsize_mutex is taken to ensure the winsize data * is consistent. */ static int tiocgwinsz(struct tty_struct *tty, struct winsize __user *arg) { int err; mutex_lock(&tty->winsize_mutex); err = copy_to_user(arg, &tty->winsize, sizeof(*arg)); mutex_unlock(&tty->winsize_mutex); return err ? -EFAULT: 0; } /** * tty_do_resize - resize event * @tty: tty being resized * @ws: new dimensions * * Update the termios variables and send the necessary signals to * peform a terminal resize correctly */ int tty_do_resize(struct tty_struct *tty, struct winsize *ws) { struct pid *pgrp; /* Lock the tty */ mutex_lock(&tty->winsize_mutex); if (!memcmp(ws, &tty->winsize, sizeof(*ws))) goto done; /* Signal the foreground process group */ pgrp = tty_get_pgrp(tty); if (pgrp) kill_pgrp(pgrp, SIGWINCH, 1); put_pid(pgrp); tty->winsize = *ws; done: mutex_unlock(&tty->winsize_mutex); return 0; } EXPORT_SYMBOL(tty_do_resize); /** * tiocswinsz - implement window size set ioctl * @tty: tty side of tty * @arg: user buffer for result * * Copies the user idea of the window size to the kernel. Traditionally * this is just advisory information but for the Linux console it * actually has driver level meaning and triggers a VC resize. * * Locking: * Driver dependent. The default do_resize method takes the * tty termios mutex and ctrl_lock. The console takes its own lock * then calls into the default method. */ static int tiocswinsz(struct tty_struct *tty, struct winsize __user *arg) { struct winsize tmp_ws; if (copy_from_user(&tmp_ws, arg, sizeof(*arg))) return -EFAULT; if (tty->ops->resize) return tty->ops->resize(tty, &tmp_ws); else return tty_do_resize(tty, &tmp_ws); } /** * tioccons - allow admin to move logical console * @file: the file to become console * * Allow the administrator to move the redirected console device * * Locking: uses redirect_lock to guard the redirect information */ static int tioccons(struct file *file) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (file->f_op->write_iter == redirected_tty_write) { struct file *f; spin_lock(&redirect_lock); f = redirect; redirect = NULL; spin_unlock(&redirect_lock); if (f) fput(f); return 0; } if (file->f_op->write_iter != tty_write) return -ENOTTY; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; spin_lock(&redirect_lock); if (redirect) { spin_unlock(&redirect_lock); return -EBUSY; } redirect = get_file(file); spin_unlock(&redirect_lock); return 0; } /** * tiocsetd - set line discipline * @tty: tty device * @p: pointer to user data * * Set the line discipline according to user request. * * Locking: see tty_set_ldisc, this function is just a helper */ static int tiocsetd(struct tty_struct *tty, int __user *p) { int disc; int ret; if (get_user(disc, p)) return -EFAULT; ret = tty_set_ldisc(tty, disc); return ret; } /** * tiocgetd - get line discipline * @tty: tty device * @p: pointer to user data * * Retrieves the line discipline id directly from the ldisc. * * Locking: waits for ldisc reference (in case the line discipline * is changing or the tty is being hungup) */ static int tiocgetd(struct tty_struct *tty, int __user *p) { struct tty_ldisc *ld; int ret; ld = tty_ldisc_ref_wait(tty); if (!ld) return -EIO; ret = put_user(ld->ops->num, p); tty_ldisc_deref(ld); return ret; } /** * send_break - performed time break * @tty: device to break on * @duration: timeout in mS * * Perform a timed break on hardware that lacks its own driver level * timed break functionality. * * Locking: * atomic_write_lock serializes * */ static int send_break(struct tty_struct *tty, unsigned int duration) { int retval; if (tty->ops->break_ctl == NULL) return 0; if (tty->driver->flags & TTY_DRIVER_HARDWARE_BREAK) return tty->ops->break_ctl(tty, duration); /* Do the work ourselves */ if (tty_write_lock(tty, false) < 0) return -EINTR; retval = tty->ops->break_ctl(tty, -1); if (!retval) { msleep_interruptible(duration); retval = tty->ops->break_ctl(tty, 0); } else if (retval == -EOPNOTSUPP) { /* some drivers can tell only dynamically */ retval = 0; } tty_write_unlock(tty); if (signal_pending(current)) retval = -EINTR; return retval; } /** * tty_tiocmget - get modem status * @tty: tty device * @p: pointer to result * * Obtain the modem status bits from the tty driver if the feature * is supported. Return -ENOTTY if it is not available. * * Locking: none (up to the driver) */ static int tty_tiocmget(struct tty_struct *tty, int __user *p) { int retval = -ENOTTY; if (tty->ops->tiocmget) { retval = tty->ops->tiocmget(tty); if (retval >= 0) retval = put_user(retval, p); } return retval; } /** * tty_tiocmset - set modem status * @tty: tty device * @cmd: command - clear bits, set bits or set all * @p: pointer to desired bits * * Set the modem status bits from the tty driver if the feature * is supported. Return -ENOTTY if it is not available. * * Locking: none (up to the driver) */ static int tty_tiocmset(struct tty_struct *tty, unsigned int cmd, unsigned __user *p) { int retval; unsigned int set, clear, val; if (tty->ops->tiocmset == NULL) return -ENOTTY; retval = get_user(val, p); if (retval) return retval; set = clear = 0; switch (cmd) { case TIOCMBIS: set = val; break; case TIOCMBIC: clear = val; break; case TIOCMSET: set = val; clear = ~val; break; } set &= TIOCM_DTR|TIOCM_RTS|TIOCM_OUT1|TIOCM_OUT2|TIOCM_LOOP; clear &= TIOCM_DTR|TIOCM_RTS|TIOCM_OUT1|TIOCM_OUT2|TIOCM_LOOP; return tty->ops->tiocmset(tty, set, clear); } static int tty_tiocgicount(struct tty_struct *tty, void __user *arg) { int retval = -EINVAL; struct serial_icounter_struct icount; memset(&icount, 0, sizeof(icount)); if (tty->ops->get_icount) retval = tty->ops->get_icount(tty, &icount); if (retval != 0) return retval; if (copy_to_user(arg, &icount, sizeof(icount))) return -EFAULT; return 0; } static int tty_tiocsserial(struct tty_struct *tty, struct serial_struct __user *ss) { static DEFINE_RATELIMIT_STATE(depr_flags, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); char comm[TASK_COMM_LEN]; struct serial_struct v; int flags; if (copy_from_user(&v, ss, sizeof(*ss))) return -EFAULT; flags = v.flags & ASYNC_DEPRECATED; if (flags && __ratelimit(&depr_flags)) pr_warn("%s: '%s' is using deprecated serial flags (with no effect): %.8x\n", __func__, get_task_comm(comm, current), flags); if (!tty->ops->set_serial) return -ENOTTY; return tty->ops->set_serial(tty, &v); } static int tty_tiocgserial(struct tty_struct *tty, struct serial_struct __user *ss) { struct serial_struct v; int err; memset(&v, 0, sizeof(v)); if (!tty->ops->get_serial) return -ENOTTY; err = tty->ops->get_serial(tty, &v); if (!err && copy_to_user(ss, &v, sizeof(v))) err = -EFAULT; return err; } /* * if pty, return the slave side (real_tty) * otherwise, return self */ static struct tty_struct *tty_pair_get_tty(struct tty_struct *tty) { if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER) tty = tty->link; return tty; } /* * Split this up, as gcc can choke on it otherwise.. */ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct tty_struct *tty = file_tty(file); struct tty_struct *real_tty; void __user *p = (void __user *)arg; int retval; struct tty_ldisc *ld; if (tty_paranoia_check(tty, file_inode(file), "tty_ioctl")) return -EINVAL; real_tty = tty_pair_get_tty(tty); /* * Factor out some common prep work */ switch (cmd) { case TIOCSETD: case TIOCSBRK: case TIOCCBRK: case TCSBRK: case TCSBRKP: retval = tty_check_change(tty); if (retval) return retval; if (cmd != TIOCCBRK) { tty_wait_until_sent(tty, 0); if (signal_pending(current)) return -EINTR; } break; } /* * Now do the stuff. */ switch (cmd) { case TIOCSTI: return tiocsti(tty, p); case TIOCGWINSZ: return tiocgwinsz(real_tty, p); case TIOCSWINSZ: return tiocswinsz(real_tty, p); case TIOCCONS: return real_tty != tty ? -EINVAL : tioccons(file); case TIOCEXCL: set_bit(TTY_EXCLUSIVE, &tty->flags); return 0; case TIOCNXCL: clear_bit(TTY_EXCLUSIVE, &tty->flags); return 0; case TIOCGEXCL: { int excl = test_bit(TTY_EXCLUSIVE, &tty->flags); return put_user(excl, (int __user *)p); } case TIOCGETD: return tiocgetd(tty, p); case TIOCSETD: return tiocsetd(tty, p); case TIOCVHANGUP: if (!capable(CAP_SYS_ADMIN)) return -EPERM; tty_vhangup(tty); return 0; case TIOCGDEV: { unsigned int ret = new_encode_dev(tty_devnum(real_tty)); return put_user(ret, (unsigned int __user *)p); } /* * Break handling */ case TIOCSBRK: /* Turn break on, unconditionally */ if (tty->ops->break_ctl) return tty->ops->break_ctl(tty, -1); return 0; case TIOCCBRK: /* Turn break off, unconditionally */ if (tty->ops->break_ctl) return tty->ops->break_ctl(tty, 0); return 0; case TCSBRK: /* SVID version: non-zero arg --> no break */ /* non-zero arg means wait for all output data * to be sent (performed above) but don't send break. * This is used by the tcdrain() termios function. */ if (!arg) return send_break(tty, 250); return 0; case TCSBRKP: /* support for POSIX tcsendbreak() */ return send_break(tty, arg ? arg*100 : 250); case TIOCMGET: return tty_tiocmget(tty, p); case TIOCMSET: case TIOCMBIC: case TIOCMBIS: return tty_tiocmset(tty, cmd, p); case TIOCGICOUNT: return tty_tiocgicount(tty, p); case TCFLSH: switch (arg) { case TCIFLUSH: case TCIOFLUSH: /* flush tty buffer and allow ldisc to process ioctl */ tty_buffer_flush(tty, NULL); break; } break; case TIOCSSERIAL: return tty_tiocsserial(tty, p); case TIOCGSERIAL: return tty_tiocgserial(tty, p); case TIOCGPTPEER: /* Special because the struct file is needed */ return ptm_open_peer(file, tty, (int)arg); default: retval = tty_jobctrl_ioctl(tty, real_tty, file, cmd, arg); if (retval != -ENOIOCTLCMD) return retval; } if (tty->ops->ioctl) { retval = tty->ops->ioctl(tty, cmd, arg); if (retval != -ENOIOCTLCMD) return retval; } ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_ioctl(file, cmd, arg); retval = -EINVAL; if (ld->ops->ioctl) { retval = ld->ops->ioctl(tty, file, cmd, arg); if (retval == -ENOIOCTLCMD) retval = -ENOTTY; } tty_ldisc_deref(ld); return retval; } #ifdef CONFIG_COMPAT struct serial_struct32 { compat_int_t type; compat_int_t line; compat_uint_t port; compat_int_t irq; compat_int_t flags; compat_int_t xmit_fifo_size; compat_int_t custom_divisor; compat_int_t baud_base; unsigned short close_delay; char io_type; char reserved_char; compat_int_t hub6; unsigned short closing_wait; /* time to wait before closing */ unsigned short closing_wait2; /* no longer used... */ compat_uint_t iomem_base; unsigned short iomem_reg_shift; unsigned int port_high; /* compat_ulong_t iomap_base FIXME */ compat_int_t reserved; }; static int compat_tty_tiocsserial(struct tty_struct *tty, struct serial_struct32 __user *ss) { static DEFINE_RATELIMIT_STATE(depr_flags, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); char comm[TASK_COMM_LEN]; struct serial_struct32 v32; struct serial_struct v; int flags; if (copy_from_user(&v32, ss, sizeof(*ss))) return -EFAULT; memcpy(&v, &v32, offsetof(struct serial_struct32, iomem_base)); v.iomem_base = compat_ptr(v32.iomem_base); v.iomem_reg_shift = v32.iomem_reg_shift; v.port_high = v32.port_high; v.iomap_base = 0; flags = v.flags & ASYNC_DEPRECATED; if (flags && __ratelimit(&depr_flags)) pr_warn("%s: '%s' is using deprecated serial flags (with no effect): %.8x\n", __func__, get_task_comm(comm, current), flags); if (!tty->ops->set_serial) return -ENOTTY; return tty->ops->set_serial(tty, &v); } static int compat_tty_tiocgserial(struct tty_struct *tty, struct serial_struct32 __user *ss) { struct serial_struct32 v32; struct serial_struct v; int err; memset(&v, 0, sizeof(v)); memset(&v32, 0, sizeof(v32)); if (!tty->ops->get_serial) return -ENOTTY; err = tty->ops->get_serial(tty, &v); if (!err) { memcpy(&v32, &v, offsetof(struct serial_struct32, iomem_base)); v32.iomem_base = (unsigned long)v.iomem_base >> 32 ? 0xfffffff : ptr_to_compat(v.iomem_base); v32.iomem_reg_shift = v.iomem_reg_shift; v32.port_high = v.port_high; if (copy_to_user(ss, &v32, sizeof(v32))) err = -EFAULT; } return err; } static long tty_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct tty_struct *tty = file_tty(file); struct tty_ldisc *ld; int retval = -ENOIOCTLCMD; switch (cmd) { case TIOCOUTQ: case TIOCSTI: case TIOCGWINSZ: case TIOCSWINSZ: case TIOCGEXCL: case TIOCGETD: case TIOCSETD: case TIOCGDEV: case TIOCMGET: case TIOCMSET: case TIOCMBIC: case TIOCMBIS: case TIOCGICOUNT: case TIOCGPGRP: case TIOCSPGRP: case TIOCGSID: case TIOCSERGETLSR: case TIOCGRS485: case TIOCSRS485: #ifdef TIOCGETP case TIOCGETP: case TIOCSETP: case TIOCSETN: #endif #ifdef TIOCGETC case TIOCGETC: case TIOCSETC: #endif #ifdef TIOCGLTC case TIOCGLTC: case TIOCSLTC: #endif case TCSETSF: case TCSETSW: case TCSETS: case TCGETS: #ifdef TCGETS2 case TCGETS2: case TCSETSF2: case TCSETSW2: case TCSETS2: #endif case TCGETA: case TCSETAF: case TCSETAW: case TCSETA: case TIOCGLCKTRMIOS: case TIOCSLCKTRMIOS: #ifdef TCGETX case TCGETX: case TCSETX: case TCSETXW: case TCSETXF: #endif case TIOCGSOFTCAR: case TIOCSSOFTCAR: case PPPIOCGCHAN: case PPPIOCGUNIT: return tty_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); case TIOCCONS: case TIOCEXCL: case TIOCNXCL: case TIOCVHANGUP: case TIOCSBRK: case TIOCCBRK: case TCSBRK: case TCSBRKP: case TCFLSH: case TIOCGPTPEER: case TIOCNOTTY: case TIOCSCTTY: case TCXONC: case TIOCMIWAIT: case TIOCSERCONFIG: return tty_ioctl(file, cmd, arg); } if (tty_paranoia_check(tty, file_inode(file), "tty_ioctl")) return -EINVAL; switch (cmd) { case TIOCSSERIAL: return compat_tty_tiocsserial(tty, compat_ptr(arg)); case TIOCGSERIAL: return compat_tty_tiocgserial(tty, compat_ptr(arg)); } if (tty->ops->compat_ioctl) { retval = tty->ops->compat_ioctl(tty, cmd, arg); if (retval != -ENOIOCTLCMD) return retval; } ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_compat_ioctl(file, cmd, arg); if (ld->ops->compat_ioctl) retval = ld->ops->compat_ioctl(tty, file, cmd, arg); if (retval == -ENOIOCTLCMD && ld->ops->ioctl) retval = ld->ops->ioctl(tty, file, (unsigned long)compat_ptr(cmd), arg); tty_ldisc_deref(ld); return retval; } #endif static int this_tty(const void *t, struct file *file, unsigned fd) { if (likely(file->f_op->read_iter != tty_read)) return 0; return file_tty(file) != t ? 0 : fd + 1; } /* * This implements the "Secure Attention Key" --- the idea is to * prevent trojan horses by killing all processes associated with this * tty when the user hits the "Secure Attention Key". Required for * super-paranoid applications --- see the Orange Book for more details. * * This code could be nicer; ideally it should send a HUP, wait a few * seconds, then send a INT, and then a KILL signal. But you then * have to coordinate with the init process, since all processes associated * with the current tty must be dead before the new getty is allowed * to spawn. * * Now, if it would be correct ;-/ The current code has a nasty hole - * it doesn't catch files in flight. We may send the descriptor to ourselves * via AF_UNIX socket, close it and later fetch from socket. FIXME. * * Nasty bug: do_SAK is being called in interrupt context. This can * deadlock. We punt it up to process context. AKPM - 16Mar2001 */ void __do_SAK(struct tty_struct *tty) { #ifdef TTY_SOFT_SAK tty_hangup(tty); #else struct task_struct *g, *p; struct pid *session; int i; unsigned long flags; if (!tty) return; spin_lock_irqsave(&tty->ctrl_lock, flags); session = get_pid(tty->session); spin_unlock_irqrestore(&tty->ctrl_lock, flags); tty_ldisc_flush(tty); tty_driver_flush_buffer(tty); read_lock(&tasklist_lock); /* Kill the entire session */ do_each_pid_task(session, PIDTYPE_SID, p) { tty_notice(tty, "SAK: killed process %d (%s): by session\n", task_pid_nr(p), p->comm); group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID); } while_each_pid_task(session, PIDTYPE_SID, p); /* Now kill any processes that happen to have the tty open */ do_each_thread(g, p) { if (p->signal->tty == tty) { tty_notice(tty, "SAK: killed process %d (%s): by controlling tty\n", task_pid_nr(p), p->comm); group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID); continue; } task_lock(p); i = iterate_fd(p->files, 0, this_tty, tty); if (i != 0) { tty_notice(tty, "SAK: killed process %d (%s): by fd#%d\n", task_pid_nr(p), p->comm, i - 1); group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID); } task_unlock(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); put_pid(session); #endif } static void do_SAK_work(struct work_struct *work) { struct tty_struct *tty = container_of(work, struct tty_struct, SAK_work); __do_SAK(tty); } /* * The tq handling here is a little racy - tty->SAK_work may already be queued. * Fortunately we don't need to worry, because if ->SAK_work is already queued, * the values which we write to it will be identical to the values which it * already has. --akpm */ void do_SAK(struct tty_struct *tty) { if (!tty) return; schedule_work(&tty->SAK_work); } EXPORT_SYMBOL(do_SAK); /* Must put_device() after it's unused! */ static struct device *tty_get_device(struct tty_struct *tty) { dev_t devt = tty_devnum(tty); return class_find_device_by_devt(tty_class, devt); } /** * alloc_tty_struct * * This subroutine allocates and initializes a tty structure. * * Locking: none - tty in question is not exposed at this point */ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx) { struct tty_struct *tty; tty = kzalloc(sizeof(*tty), GFP_KERNEL); if (!tty) return NULL; kref_init(&tty->kref); tty->magic = TTY_MAGIC; if (tty_ldisc_init(tty)) { kfree(tty); return NULL; } tty->session = NULL; tty->pgrp = NULL; mutex_init(&tty->legacy_mutex); mutex_init(&tty->throttle_mutex); init_rwsem(&tty->termios_rwsem); mutex_init(&tty->winsize_mutex); init_ldsem(&tty->ldisc_sem); init_waitqueue_head(&tty->write_wait); init_waitqueue_head(&tty->read_wait); INIT_WORK(&tty->hangup_work, do_tty_hangup); mutex_init(&tty->atomic_write_lock); spin_lock_init(&tty->ctrl_lock); spin_lock_init(&tty->flow_lock); spin_lock_init(&tty->files_lock); INIT_LIST_HEAD(&tty->tty_files); INIT_WORK(&tty->SAK_work, do_SAK_work); tty->driver = driver; tty->ops = driver->ops; tty->index = idx; tty_line_name(driver, idx, tty->name); tty->dev = tty_get_device(tty); return tty; } /** * tty_put_char - write one character to a tty * @tty: tty * @ch: character * * Write one byte to the tty using the provided put_char method * if present. Returns the number of characters successfully output. * * Note: the specific put_char operation in the driver layer may go * away soon. Don't call it directly, use this method */ int tty_put_char(struct tty_struct *tty, unsigned char ch) { if (tty->ops->put_char) return tty->ops->put_char(tty, ch); return tty->ops->write(tty, &ch, 1); } EXPORT_SYMBOL_GPL(tty_put_char); struct class *tty_class; static int tty_cdev_add(struct tty_driver *driver, dev_t dev, unsigned int index, unsigned int count) { int err; /* init here, since reused cdevs cause crashes */ driver->cdevs[index] = cdev_alloc(); if (!driver->cdevs[index]) return -ENOMEM; driver->cdevs[index]->ops = &tty_fops; driver->cdevs[index]->owner = driver->owner; err = cdev_add(driver->cdevs[index], dev, count); if (err) kobject_put(&driver->cdevs[index]->kobj); return err; } /** * tty_register_device - register a tty device * @driver: the tty driver that describes the tty device * @index: the index in the tty driver for this tty device * @device: a struct device that is associated with this tty device. * This field is optional, if there is no known struct device * for this tty device it can be set to NULL safely. * * Returns a pointer to the struct device for this tty device * (or ERR_PTR(-EFOO) on error). * * This call is required to be made to register an individual tty device * if the tty driver's flags have the TTY_DRIVER_DYNAMIC_DEV bit set. If * that bit is not set, this function should not be called by a tty * driver. * * Locking: ?? */ struct device *tty_register_device(struct tty_driver *driver, unsigned index, struct device *device) { return tty_register_device_attr(driver, index, device, NULL, NULL); } EXPORT_SYMBOL(tty_register_device); static void tty_device_create_release(struct device *dev) { dev_dbg(dev, "releasing...\n"); kfree(dev); } /** * tty_register_device_attr - register a tty device * @driver: the tty driver that describes the tty device * @index: the index in the tty driver for this tty device * @device: a struct device that is associated with this tty device. * This field is optional, if there is no known struct device * for this tty device it can be set to NULL safely. * @drvdata: Driver data to be set to device. * @attr_grp: Attribute group to be set on device. * * Returns a pointer to the struct device for this tty device * (or ERR_PTR(-EFOO) on error). * * This call is required to be made to register an individual tty device * if the tty driver's flags have the TTY_DRIVER_DYNAMIC_DEV bit set. If * that bit is not set, this function should not be called by a tty * driver. * * Locking: ?? */ struct device *tty_register_device_attr(struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp) { char name[64]; dev_t devt = MKDEV(driver->major, driver->minor_start) + index; struct ktermios *tp; struct device *dev; int retval; if (index >= driver->num) { pr_err("%s: Attempt to register invalid tty line number (%d)\n", driver->name, index); return ERR_PTR(-EINVAL); } if (driver->type == TTY_DRIVER_TYPE_PTY) pty_line_name(driver, index, name); else tty_line_name(driver, index, name); dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return ERR_PTR(-ENOMEM); dev->devt = devt; dev->class = tty_class; dev->parent = device; dev->release = tty_device_create_release; dev_set_name(dev, "%s", name); dev->groups = attr_grp; dev_set_drvdata(dev, drvdata); dev_set_uevent_suppress(dev, 1); retval = device_register(dev); if (retval) goto err_put; if (!(driver->flags & TTY_DRIVER_DYNAMIC_ALLOC)) { /* * Free any saved termios data so that the termios state is * reset when reusing a minor number. */ tp = driver->termios[index]; if (tp) { driver->termios[index] = NULL; kfree(tp); } retval = tty_cdev_add(driver, devt, index, 1); if (retval) goto err_del; } dev_set_uevent_suppress(dev, 0); kobject_uevent(&dev->kobj, KOBJ_ADD); return dev; err_del: device_del(dev); err_put: put_device(dev); return ERR_PTR(retval); } EXPORT_SYMBOL_GPL(tty_register_device_attr); /** * tty_unregister_device - unregister a tty device * @driver: the tty driver that describes the tty device * @index: the index in the tty driver for this tty device * * If a tty device is registered with a call to tty_register_device() then * this function must be called when the tty device is gone. * * Locking: ?? */ void tty_unregister_device(struct tty_driver *driver, unsigned index) { device_destroy(tty_class, MKDEV(driver->major, driver->minor_start) + index); if (!(driver->flags & TTY_DRIVER_DYNAMIC_ALLOC)) { cdev_del(driver->cdevs[index]); driver->cdevs[index] = NULL; } } EXPORT_SYMBOL(tty_unregister_device); /** * __tty_alloc_driver -- allocate tty driver * @lines: count of lines this driver can handle at most * @owner: module which is responsible for this driver * @flags: some of TTY_DRIVER_* flags, will be set in driver->flags * * This should not be called directly, some of the provided macros should be * used instead. Use IS_ERR and friends on @retval. */ struct tty_driver *__tty_alloc_driver(unsigned int lines, struct module *owner, unsigned long flags) { struct tty_driver *driver; unsigned int cdevs = 1; int err; if (!lines || (flags & TTY_DRIVER_UNNUMBERED_NODE && lines > 1)) return ERR_PTR(-EINVAL); driver = kzalloc(sizeof(*driver), GFP_KERNEL); if (!driver) return ERR_PTR(-ENOMEM); kref_init(&driver->kref); driver->magic = TTY_DRIVER_MAGIC; driver->num = lines; driver->owner = owner; driver->flags = flags; if (!(flags & TTY_DRIVER_DEVPTS_MEM)) { driver->ttys = kcalloc(lines, sizeof(*driver->ttys), GFP_KERNEL); driver->termios = kcalloc(lines, sizeof(*driver->termios), GFP_KERNEL); if (!driver->ttys || !driver->termios) { err = -ENOMEM; goto err_free_all; } } if (!(flags & TTY_DRIVER_DYNAMIC_ALLOC)) { driver->ports = kcalloc(lines, sizeof(*driver->ports), GFP_KERNEL); if (!driver->ports) { err = -ENOMEM; goto err_free_all; } cdevs = lines; } driver->cdevs = kcalloc(cdevs, sizeof(*driver->cdevs), GFP_KERNEL); if (!driver->cdevs) { err = -ENOMEM; goto err_free_all; } return driver; err_free_all: kfree(driver->ports); kfree(driver->ttys); kfree(driver->termios); kfree(driver->cdevs); kfree(driver); return ERR_PTR(err); } EXPORT_SYMBOL(__tty_alloc_driver); static void destruct_tty_driver(struct kref *kref) { struct tty_driver *driver = container_of(kref, struct tty_driver, kref); int i; struct ktermios *tp; if (driver->flags & TTY_DRIVER_INSTALLED) { for (i = 0; i < driver->num; i++) { tp = driver->termios[i]; if (tp) { driver->termios[i] = NULL; kfree(tp); } if (!(driver->flags & TTY_DRIVER_DYNAMIC_DEV)) tty_unregister_device(driver, i); } proc_tty_unregister_driver(driver); if (driver->flags & TTY_DRIVER_DYNAMIC_ALLOC) cdev_del(driver->cdevs[0]); } kfree(driver->cdevs); kfree(driver->ports); kfree(driver->termios); kfree(driver->ttys); kfree(driver); } void tty_driver_kref_put(struct tty_driver *driver) { kref_put(&driver->kref, destruct_tty_driver); } EXPORT_SYMBOL(tty_driver_kref_put); void tty_set_operations(struct tty_driver *driver, const struct tty_operations *op) { driver->ops = op; }; EXPORT_SYMBOL(tty_set_operations); void put_tty_driver(struct tty_driver *d) { tty_driver_kref_put(d); } EXPORT_SYMBOL(put_tty_driver); /* * Called by a tty driver to register itself. */ int tty_register_driver(struct tty_driver *driver) { int error; int i; dev_t dev; struct device *d; if (!driver->major) { error = alloc_chrdev_region(&dev, driver->minor_start, driver->num, driver->name); if (!error) { driver->major = MAJOR(dev); driver->minor_start = MINOR(dev); } } else { dev = MKDEV(driver->major, driver->minor_start); error = register_chrdev_region(dev, driver->num, driver->name); } if (error < 0) goto err; if (driver->flags & TTY_DRIVER_DYNAMIC_ALLOC) { error = tty_cdev_add(driver, dev, 0, driver->num); if (error) goto err_unreg_char; } mutex_lock(&tty_mutex); list_add(&driver->tty_drivers, &tty_drivers); mutex_unlock(&tty_mutex); if (!(driver->flags & TTY_DRIVER_DYNAMIC_DEV)) { for (i = 0; i < driver->num; i++) { d = tty_register_device(driver, i, NULL); if (IS_ERR(d)) { error = PTR_ERR(d); goto err_unreg_devs; } } } proc_tty_register_driver(driver); driver->flags |= TTY_DRIVER_INSTALLED; return 0; err_unreg_devs: for (i--; i >= 0; i--) tty_unregister_device(driver, i); mutex_lock(&tty_mutex); list_del(&driver->tty_drivers); mutex_unlock(&tty_mutex); err_unreg_char: unregister_chrdev_region(dev, driver->num); err: return error; } EXPORT_SYMBOL(tty_register_driver); /* * Called by a tty driver to unregister itself. */ int tty_unregister_driver(struct tty_driver *driver) { #if 0 /* FIXME */ if (driver->refcount) return -EBUSY; #endif unregister_chrdev_region(MKDEV(driver->major, driver->minor_start), driver->num); mutex_lock(&tty_mutex); list_del(&driver->tty_drivers); mutex_unlock(&tty_mutex); return 0; } EXPORT_SYMBOL(tty_unregister_driver); dev_t tty_devnum(struct tty_struct *tty) { return MKDEV(tty->driver->major, tty->driver->minor_start) + tty->index; } EXPORT_SYMBOL(tty_devnum); void tty_default_fops(struct file_operations *fops) { *fops = tty_fops; } static char *tty_devnode(struct device *dev, umode_t *mode) { if (!mode) return NULL; if (dev->devt == MKDEV(TTYAUX_MAJOR, 0) || dev->devt == MKDEV(TTYAUX_MAJOR, 2)) *mode = 0666; return NULL; } static int __init tty_class_init(void) { tty_class = class_create(THIS_MODULE, "tty"); if (IS_ERR(tty_class)) return PTR_ERR(tty_class); tty_class->devnode = tty_devnode; return 0; } postcore_initcall(tty_class_init); /* 3/2004 jmc: why do these devices exist? */ static struct cdev tty_cdev, console_cdev; static ssize_t show_cons_active(struct device *dev, struct device_attribute *attr, char *buf) { struct console *cs[16]; int i = 0; struct console *c; ssize_t count = 0; console_lock(); for_each_console(c) { if (!c->device) continue; if (!c->write) continue; if ((c->flags & CON_ENABLED) == 0) continue; cs[i++] = c; if (i >= ARRAY_SIZE(cs)) break; } while (i--) { int index = cs[i]->index; struct tty_driver *drv = cs[i]->device(cs[i], &index); /* don't resolve tty0 as some programs depend on it */ if (drv && (cs[i]->index > 0 || drv->major != TTY_MAJOR)) count += tty_line_name(drv, index, buf + count); else count += sprintf(buf + count, "%s%d", cs[i]->name, cs[i]->index); count += sprintf(buf + count, "%c", i ? ' ':'\n'); } console_unlock(); return count; } static DEVICE_ATTR(active, S_IRUGO, show_cons_active, NULL); static struct attribute *cons_dev_attrs[] = { &dev_attr_active.attr, NULL }; ATTRIBUTE_GROUPS(cons_dev); static struct device *consdev; void console_sysfs_notify(void) { if (consdev) sysfs_notify(&consdev->kobj, NULL, "active"); } /* * Ok, now we can initialize the rest of the tty devices and can count * on memory allocations, interrupts etc.. */ int __init tty_init(void) { tty_sysctl_init(); cdev_init(&tty_cdev, &tty_fops); if (cdev_add(&tty_cdev, MKDEV(TTYAUX_MAJOR, 0), 1) || register_chrdev_region(MKDEV(TTYAUX_MAJOR, 0), 1, "/dev/tty") < 0) panic("Couldn't register /dev/tty driver\n"); device_create(tty_class, NULL, MKDEV(TTYAUX_MAJOR, 0), NULL, "tty"); cdev_init(&console_cdev, &console_fops); if (cdev_add(&console_cdev, MKDEV(TTYAUX_MAJOR, 1), 1) || register_chrdev_region(MKDEV(TTYAUX_MAJOR, 1), 1, "/dev/console") < 0) panic("Couldn't register /dev/console driver\n"); consdev = device_create_with_groups(tty_class, NULL, MKDEV(TTYAUX_MAJOR, 1), NULL, cons_dev_groups, "console"); if (IS_ERR(consdev)) consdev = NULL; #ifdef CONFIG_VT vty_init(&console_fops); #endif return 0; }
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_SPINLOCK_H #define __LINUX_SPINLOCK_H /* * include/linux/spinlock.h - generic spinlock/rwlock declarations * * here's the role of the various spinlock/rwlock related include files: * * on SMP builds: * * asm/spinlock_types.h: contains the arch_spinlock_t/arch_rwlock_t and the * initializers * * linux/spinlock_types.h: * defines the generic type and initializers * * asm/spinlock.h: contains the arch_spin_*()/etc. lowlevel * implementations, mostly inline assembly code * * (also included on UP-debug builds:) * * linux/spinlock_api_smp.h: * contains the prototypes for the _spin_*() APIs. * * linux/spinlock.h: builds the final spin_*() APIs. * * on UP builds: * * linux/spinlock_type_up.h: * contains the generic, simplified UP spinlock type. * (which is an empty structure on non-debug builds) * * linux/spinlock_types.h: * defines the generic type and initializers * * linux/spinlock_up.h: * contains the arch_spin_*()/etc. version of UP * builds. (which are NOPs on non-debug, non-preempt * builds) * * (included on UP-non-debug builds:) * * linux/spinlock_api_up.h: * builds the _spin_*() APIs. * * linux/spinlock.h: builds the final spin_*() APIs. */ #include <linux/typecheck.h> #include <linux/preempt.h> #include <linux/linkage.h> #include <linux/compiler.h> #include <linux/irqflags.h> #include <linux/thread_info.h> #include <linux/kernel.h> #include <linux/stringify.h> #include <linux/bottom_half.h> #include <linux/lockdep.h> #include <asm/barrier.h> #include <asm/mmiowb.h> /* * Must define these before including other files, inline functions need them */ #define LOCK_SECTION_NAME ".text..lock."KBUILD_BASENAME #define LOCK_SECTION_START(extra) \ ".subsection 1\n\t" \ extra \ ".ifndef " LOCK_SECTION_NAME "\n\t" \ LOCK_SECTION_NAME ":\n\t" \ ".endif\n" #define LOCK_SECTION_END \ ".previous\n\t" #define __lockfunc __section(".spinlock.text") /* * Pull the arch_spinlock_t and arch_rwlock_t definitions: */ #include <linux/spinlock_types.h> /* * Pull the arch_spin*() functions/declarations (UP-nondebug doesn't need them): */ #ifdef CONFIG_SMP # include <asm/spinlock.h> #else # include <linux/spinlock_up.h> #endif #ifdef CONFIG_DEBUG_SPINLOCK extern void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, struct lock_class_key *key, short inner); # define raw_spin_lock_init(lock) \ do { \ static struct lock_class_key __key; \ \ __raw_spin_lock_init((lock), #lock, &__key, LD_WAIT_SPIN); \ } while (0) #else # define raw_spin_lock_init(lock) \ do { *(lock) = __RAW_SPIN_LOCK_UNLOCKED(lock); } while (0) #endif #define raw_spin_is_locked(lock) arch_spin_is_locked(&(lock)->raw_lock) #ifdef arch_spin_is_contended #define raw_spin_is_contended(lock) arch_spin_is_contended(&(lock)->raw_lock) #else #define raw_spin_is_contended(lock) (((void)(lock), 0)) #endif /*arch_spin_is_contended*/ /* * smp_mb__after_spinlock() provides the equivalent of a full memory barrier * between program-order earlier lock acquisitions and program-order later * memory accesses. * * This guarantees that the following two properties hold: * * 1) Given the snippet: * * { X = 0; Y = 0; } * * CPU0 CPU1 * * WRITE_ONCE(X, 1); WRITE_ONCE(Y, 1); * spin_lock(S); smp_mb(); * smp_mb__after_spinlock(); r1 = READ_ONCE(X); * r0 = READ_ONCE(Y); * spin_unlock(S); * * it is forbidden that CPU0 does not observe CPU1's store to Y (r0 = 0) * and CPU1 does not observe CPU0's store to X (r1 = 0); see the comments * preceding the call to smp_mb__after_spinlock() in __schedule() and in * try_to_wake_up(). * * 2) Given the snippet: * * { X = 0; Y = 0; } * * CPU0 CPU1 CPU2 * * spin_lock(S); spin_lock(S); r1 = READ_ONCE(Y); * WRITE_ONCE(X, 1); smp_mb__after_spinlock(); smp_rmb(); * spin_unlock(S); r0 = READ_ONCE(X); r2 = READ_ONCE(X); * WRITE_ONCE(Y, 1); * spin_unlock(S); * * it is forbidden that CPU0's critical section executes before CPU1's * critical section (r0 = 1), CPU2 observes CPU1's store to Y (r1 = 1) * and CPU2 does not observe CPU0's store to X (r2 = 0); see the comments * preceding the calls to smp_rmb() in try_to_wake_up() for similar * snippets but "projected" onto two CPUs. * * Property (2) upgrades the lock to an RCsc lock. * * Since most load-store architectures implement ACQUIRE with an smp_mb() after * the LL/SC loop, they need no further barriers. Similarly all our TSO * architectures imply an smp_mb() for each atomic instruction and equally don't * need more. * * Architectures that can implement ACQUIRE better need to take care. */ #ifndef smp_mb__after_spinlock #define smp_mb__after_spinlock() do { } while (0) #endif #ifdef CONFIG_DEBUG_SPINLOCK extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock); #define do_raw_spin_lock_flags(lock, flags) do_raw_spin_lock(lock) extern int do_raw_spin_trylock(raw_spinlock_t *lock); extern void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock); #else static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock) { __acquire(lock); arch_spin_lock(&lock->raw_lock); mmiowb_spin_lock(); } #ifndef arch_spin_lock_flags #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) #endif static inline void do_raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long *flags) __acquires(lock) { __acquire(lock); arch_spin_lock_flags(&lock->raw_lock, *flags); mmiowb_spin_lock(); } static inline int do_raw_spin_trylock(raw_spinlock_t *lock) { int ret = arch_spin_trylock(&(lock)->raw_lock); if (ret) mmiowb_spin_lock(); return ret; } static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) { mmiowb_spin_unlock(); arch_spin_unlock(&lock->raw_lock); __release(lock); } #endif /* * Define the various spin_lock methods. Note we define these * regardless of whether CONFIG_SMP or CONFIG_PREEMPTION are set. The * various methods are defined as nops in the case they are not * required. */ #define raw_spin_trylock(lock) __cond_lock(lock, _raw_spin_trylock(lock)) #define raw_spin_lock(lock) _raw_spin_lock(lock) #ifdef CONFIG_DEBUG_LOCK_ALLOC # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock_nested(lock, subclass) # define raw_spin_lock_nest_lock(lock, nest_lock) \ do { \ typecheck(struct lockdep_map *, &(nest_lock)->dep_map);\ _raw_spin_lock_nest_lock(lock, &(nest_lock)->dep_map); \ } while (0) #else /* * Always evaluate the 'subclass' argument to avoid that the compiler * warns about set-but-not-used variables when building with * CONFIG_DEBUG_LOCK_ALLOC=n and with W=1. */ # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock(((void)(subclass), (lock))) # define raw_spin_lock_nest_lock(lock, nest_lock) _raw_spin_lock(lock) #endif #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) #define raw_spin_lock_irqsave(lock, flags) \ do { \ typecheck(unsigned long, flags); \ flags = _raw_spin_lock_irqsave(lock); \ } while (0) #ifdef CONFIG_DEBUG_LOCK_ALLOC #define raw_spin_lock_irqsave_nested(lock, flags, subclass) \ do { \ typecheck(unsigned long, flags); \ flags = _raw_spin_lock_irqsave_nested(lock, subclass); \ } while (0) #else #define raw_spin_lock_irqsave_nested(lock, flags, subclass) \ do { \ typecheck(unsigned long, flags); \ flags = _raw_spin_lock_irqsave(lock); \ } while (0) #endif #else #define raw_spin_lock_irqsave(lock, flags) \ do { \ typecheck(unsigned long, flags); \ _raw_spin_lock_irqsave(lock, flags); \ } while (0) #define raw_spin_lock_irqsave_nested(lock, flags, subclass) \ raw_spin_lock_irqsave(lock, flags) #endif #define raw_spin_lock_irq(lock) _raw_spin_lock_irq(lock) #define raw_spin_lock_bh(lock) _raw_spin_lock_bh(lock) #define raw_spin_unlock(lock) _raw_spin_unlock(lock) #define raw_spin_unlock_irq(lock) _raw_spin_unlock_irq(lock) #define raw_spin_unlock_irqrestore(lock, flags) \ do { \ typecheck(unsigned long, flags); \ _raw_spin_unlock_irqrestore(lock, flags); \ } while (0) #define raw_spin_unlock_bh(lock) _raw_spin_unlock_bh(lock) #define raw_spin_trylock_bh(lock) \ __cond_lock(lock, _raw_spin_trylock_bh(lock)) #define raw_spin_trylock_irq(lock) \ ({ \ local_irq_disable(); \ raw_spin_trylock(lock) ? \ 1 : ({ local_irq_enable(); 0; }); \ }) #define raw_spin_trylock_irqsave(lock, flags) \ ({ \ local_irq_save(flags); \ raw_spin_trylock(lock) ? \ 1 : ({ local_irq_restore(flags); 0; }); \ }) /* Include rwlock functions */ #include <linux/rwlock.h> /* * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: */ #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) # include <linux/spinlock_api_smp.h> #else # include <linux/spinlock_api_up.h> #endif /* * Map the spin_lock functions to the raw variants for PREEMPT_RT=n */ static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock) { return &lock->rlock; } #ifdef CONFIG_DEBUG_SPINLOCK # define spin_lock_init(lock) \ do { \ static struct lock_class_key __key; \ \ __raw_spin_lock_init(spinlock_check(lock), \ #lock, &__key, LD_WAIT_CONFIG); \ } while (0) #else # define spin_lock_init(_lock) \ do { \ spinlock_check(_lock); \ *(_lock) = __SPIN_LOCK_UNLOCKED(_lock); \ } while (0) #endif static __always_inline void spin_lock(spinlock_t *lock) { raw_spin_lock(&lock->rlock); } static __always_inline void spin_lock_bh(spinlock_t *lock) { raw_spin_lock_bh(&lock->rlock); } static __always_inline int spin_trylock(spinlock_t *lock) { return raw_spin_trylock(&lock->rlock); } #define spin_lock_nested(lock, subclass) \ do { \ raw_spin_lock_nested(spinlock_check(lock), subclass); \ } while (0) #define spin_lock_nest_lock(lock, nest_lock) \ do { \ raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock); \ } while (0) static __always_inline void spin_lock_irq(spinlock_t *lock) { raw_spin_lock_irq(&lock->rlock); } #define spin_lock_irqsave(lock, flags) \ do { \ raw_spin_lock_irqsave(spinlock_check(lock), flags); \ } while (0) #define spin_lock_irqsave_nested(lock, flags, subclass) \ do { \ raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \ } while (0) static __always_inline void spin_unlock(spinlock_t *lock) { raw_spin_unlock(&lock->rlock); } static __always_inline void spin_unlock_bh(spinlock_t *lock) { raw_spin_unlock_bh(&lock->rlock); } static __always_inline void spin_unlock_irq(spinlock_t *lock) { raw_spin_unlock_irq(&lock->rlock); } static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) { raw_spin_unlock_irqrestore(&lock->rlock, flags); } static __always_inline int spin_trylock_bh(spinlock_t *lock) { return raw_spin_trylock_bh(&lock->rlock); } static __always_inline int spin_trylock_irq(spinlock_t *lock) { return raw_spin_trylock_irq(&lock->rlock); } #define spin_trylock_irqsave(lock, flags) \ ({ \ raw_spin_trylock_irqsave(spinlock_check(lock), flags); \ }) /** * spin_is_locked() - Check whether a spinlock is locked. * @lock: Pointer to the spinlock. * * This function is NOT required to provide any memory ordering * guarantees; it could be used for debugging purposes or, when * additional synchronization is needed, accompanied with other * constructs (memory barriers) enforcing the synchronization. * * Returns: 1 if @lock is locked, 0 otherwise. * * Note that the function only tells you that the spinlock is * seen to be locked, not that it is locked on your CPU. * * Further, on CONFIG_SMP=n builds with CONFIG_DEBUG_SPINLOCK=n, * the return value is always 0 (see include/linux/spinlock_up.h). * Therefore you should not rely heavily on the return value. */ static __always_inline int spin_is_locked(spinlock_t *lock) { return raw_spin_is_locked(&lock->rlock); } static __always_inline int spin_is_contended(spinlock_t *lock) { return raw_spin_is_contended(&lock->rlock); } #define assert_spin_locked(lock) assert_raw_spin_locked(&(lock)->rlock) /* * Pull the atomic_t declaration: * (asm-mips/atomic.h needs above definitions) */ #include <linux/atomic.h> /** * atomic_dec_and_lock - lock on reaching reference count zero * @atomic: the atomic counter * @lock: the spinlock in question * * Decrements @atomic by 1. If the result is 0, returns true and locks * @lock. Returns false for all other cases. */ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock); #define atomic_dec_and_lock(atomic, lock) \ __cond_lock(lock, _atomic_dec_and_lock(atomic, lock)) extern int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock, unsigned long *flags); #define atomic_dec_and_lock_irqsave(atomic, lock, flags) \ __cond_lock(lock, _atomic_dec_and_lock_irqsave(atomic, lock, &(flags))) int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask, size_t max_size, unsigned int cpu_mult, gfp_t gfp, const char *name, struct lock_class_key *key); #define alloc_bucket_spinlocks(locks, lock_mask, max_size, cpu_mult, gfp) \ ({ \ static struct lock_class_key key; \ int ret; \ \ ret = __alloc_bucket_spinlocks(locks, lock_mask, max_size, \ cpu_mult, gfp, #locks, &key); \ ret; \ }) void free_bucket_spinlocks(spinlock_t *locks); #endif /* __LINUX_SPINLOCK_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 /* SPDX-License-Identifier: GPL-2.0 */ /* * Security server interface. * * Author : Stephen Smalley, <sds@tycho.nsa.gov> * */ #ifndef _SELINUX_SECURITY_H_ #define _SELINUX_SECURITY_H_ #include <linux/compiler.h> #include <linux/dcache.h> #include <linux/magic.h> #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/refcount.h> #include <linux/workqueue.h> #include "flask.h" #include "policycap.h" #define SECSID_NULL 0x00000000 /* unspecified SID */ #define SECSID_WILD 0xffffffff /* wildcard SID */ #define SECCLASS_NULL 0x0000 /* no class */ /* Identify specific policy version changes */ #define POLICYDB_VERSION_BASE 15 #define POLICYDB_VERSION_BOOL 16 #define POLICYDB_VERSION_IPV6 17 #define POLICYDB_VERSION_NLCLASS 18 #define POLICYDB_VERSION_VALIDATETRANS 19 #define POLICYDB_VERSION_MLS 19 #define POLICYDB_VERSION_AVTAB 20 #define POLICYDB_VERSION_RANGETRANS 21 #define POLICYDB_VERSION_POLCAP 22 #define POLICYDB_VERSION_PERMISSIVE 23 #define POLICYDB_VERSION_BOUNDARY 24 #define POLICYDB_VERSION_FILENAME_TRANS 25 #define POLICYDB_VERSION_ROLETRANS 26 #define POLICYDB_VERSION_NEW_OBJECT_DEFAULTS 27 #define POLICYDB_VERSION_DEFAULT_TYPE 28 #define POLICYDB_VERSION_CONSTRAINT_NAMES 29 #define POLICYDB_VERSION_XPERMS_IOCTL 30 #define POLICYDB_VERSION_INFINIBAND 31 #define POLICYDB_VERSION_GLBLUB 32 #define POLICYDB_VERSION_COMP_FTRANS 33 /* compressed filename transitions */ /* Range of policy versions we understand*/ #define POLICYDB_VERSION_MIN POLICYDB_VERSION_BASE #define POLICYDB_VERSION_MAX POLICYDB_VERSION_COMP_FTRANS /* Mask for just the mount related flags */ #define SE_MNTMASK 0x0f /* Super block security struct flags for mount options */ /* BE CAREFUL, these need to be the low order bits for selinux_get_mnt_opts */ #define CONTEXT_MNT 0x01 #define FSCONTEXT_MNT 0x02 #define ROOTCONTEXT_MNT 0x04 #define DEFCONTEXT_MNT 0x08 #define SBLABEL_MNT 0x10 /* Non-mount related flags */ #define SE_SBINITIALIZED 0x0100 #define SE_SBPROC 0x0200 #define SE_SBGENFS 0x0400 #define SE_SBGENFS_XATTR 0x0800 #define CONTEXT_STR "context" #define FSCONTEXT_STR "fscontext" #define ROOTCONTEXT_STR "rootcontext" #define DEFCONTEXT_STR "defcontext" #define SECLABEL_STR "seclabel" struct netlbl_lsm_secattr; extern int selinux_enabled_boot; /* * type_datum properties * available at the kernel policy version >= POLICYDB_VERSION_BOUNDARY */ #define TYPEDATUM_PROPERTY_PRIMARY 0x0001 #define TYPEDATUM_PROPERTY_ATTRIBUTE 0x0002 /* limitation of boundary depth */ #define POLICYDB_BOUNDS_MAXDEPTH 4 struct selinux_avc; struct selinux_policy; struct selinux_state { #ifdef CONFIG_SECURITY_SELINUX_DISABLE bool disabled; #endif #ifdef CONFIG_SECURITY_SELINUX_DEVELOP bool enforcing; #endif bool checkreqprot; bool initialized; bool policycap[__POLICYDB_CAPABILITY_MAX]; struct page *status_page; struct mutex status_lock; struct selinux_avc *avc; struct selinux_policy __rcu *policy; struct mutex policy_mutex; } __randomize_layout; void selinux_avc_init(struct selinux_avc **avc); extern struct selinux_state selinux_state; static inline bool selinux_initialized(const struct selinux_state *state) { /* do a synchronized load to avoid race conditions */ return smp_load_acquire(&state->initialized); } static inline void selinux_mark_initialized(struct selinux_state *state) { /* do a synchronized write to avoid race conditions */ smp_store_release(&state->initialized, true); } #ifdef CONFIG_SECURITY_SELINUX_DEVELOP static inline bool enforcing_enabled(struct selinux_state *state) { return READ_ONCE(state->enforcing); } static inline void enforcing_set(struct selinux_state *state, bool value) { WRITE_ONCE(state->enforcing, value); } #else static inline bool enforcing_enabled(struct selinux_state *state) { return true; } static inline void enforcing_set(struct selinux_state *state, bool value) { } #endif static inline bool checkreqprot_get(const struct selinux_state *state) { return READ_ONCE(state->checkreqprot); } static inline void checkreqprot_set(struct selinux_state *state, bool value) { WRITE_ONCE(state->checkreqprot, value); } #ifdef CONFIG_SECURITY_SELINUX_DISABLE static inline bool selinux_disabled(struct selinux_state *state) { return READ_ONCE(state->disabled); } static inline void selinux_mark_disabled(struct selinux_state *state) { WRITE_ONCE(state->disabled, true); } #else static inline bool selinux_disabled(struct selinux_state *state) { return false; } #endif static inline bool selinux_policycap_netpeer(void) { struct selinux_state *state = &selinux_state; return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_NETPEER]); } static inline bool selinux_policycap_openperm(void) { struct selinux_state *state = &selinux_state; return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_OPENPERM]); } static inline bool selinux_policycap_extsockclass(void) { struct selinux_state *state = &selinux_state; return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_EXTSOCKCLASS]); } static inline bool selinux_policycap_alwaysnetwork(void) { struct selinux_state *state = &selinux_state; return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_ALWAYSNETWORK]); } static inline bool selinux_policycap_cgroupseclabel(void) { struct selinux_state *state = &selinux_state; return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_CGROUPSECLABEL]); } static inline bool selinux_policycap_nnp_nosuid_transition(void) { struct selinux_state *state = &selinux_state; return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_NNP_NOSUID_TRANSITION]); } static inline bool selinux_policycap_genfs_seclabel_symlinks(void) { struct selinux_state *state = &selinux_state; return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_GENFS_SECLABEL_SYMLINKS]); } static inline bool selinux_policycap_ioctl_skip_cloexec(void) { struct selinux_state *state = &selinux_state; return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_IOCTL_SKIP_CLOEXEC]); } struct selinux_policy_convert_data; struct selinux_load_state { struct selinux_policy *policy; struct selinux_policy_convert_data *convert_data; }; int security_mls_enabled(struct selinux_state *state); int security_load_policy(struct selinux_state *state, void *data, size_t len, struct selinux_load_state *load_state); void selinux_policy_commit(struct selinux_state *state, struct selinux_load_state *load_state); void selinux_policy_cancel(struct selinux_state *state, struct selinux_load_state *load_state); int security_read_policy(struct selinux_state *state, void **data, size_t *len); int security_policycap_supported(struct selinux_state *state, unsigned int req_cap); #define SEL_VEC_MAX 32 struct av_decision { u32 allowed; u32 auditallow; u32 auditdeny; u32 seqno; u32 flags; }; #define XPERMS_ALLOWED 1 #define XPERMS_AUDITALLOW 2 #define XPERMS_DONTAUDIT 4 #define security_xperm_set(perms, x) (perms[x >> 5] |= 1 << (x & 0x1f)) #define security_xperm_test(perms, x) (1 & (perms[x >> 5] >> (x & 0x1f))) struct extended_perms_data { u32 p[8]; }; struct extended_perms_decision { u8 used; u8 driver; struct extended_perms_data *allowed; struct extended_perms_data *auditallow; struct extended_perms_data *dontaudit; }; struct extended_perms { u16 len; /* length associated decision chain */ struct extended_perms_data drivers; /* flag drivers that are used */ }; /* definitions of av_decision.flags */ #define AVD_FLAGS_PERMISSIVE 0x0001 void security_compute_av(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd, struct extended_perms *xperms); void security_compute_xperms_decision(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, u8 driver, struct extended_perms_decision *xpermd); void security_compute_av_user(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd); int security_transition_sid(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, const struct qstr *qstr, u32 *out_sid); int security_transition_sid_user(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, const char *objname, u32 *out_sid); int security_member_sid(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, u32 *out_sid); int security_change_sid(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, u32 *out_sid); int security_sid_to_context(struct selinux_state *state, u32 sid, char **scontext, u32 *scontext_len); int security_sid_to_context_force(struct selinux_state *state, u32 sid, char **scontext, u32 *scontext_len); int security_sid_to_context_inval(struct selinux_state *state, u32 sid, char **scontext, u32 *scontext_len); int security_context_to_sid(struct selinux_state *state, const char *scontext, u32 scontext_len, u32 *out_sid, gfp_t gfp); int security_context_str_to_sid(struct selinux_state *state, const char *scontext, u32 *out_sid, gfp_t gfp); int security_context_to_sid_default(struct selinux_state *state, const char *scontext, u32 scontext_len, u32 *out_sid, u32 def_sid, gfp_t gfp_flags); int security_context_to_sid_force(struct selinux_state *state, const char *scontext, u32 scontext_len, u32 *sid); int security_get_user_sids(struct selinux_state *state, u32 callsid, char *username, u32 **sids, u32 *nel); int security_port_sid(struct selinux_state *state, u8 protocol, u16 port, u32 *out_sid); int security_ib_pkey_sid(struct selinux_state *state, u64 subnet_prefix, u16 pkey_num, u32 *out_sid); int security_ib_endport_sid(struct selinux_state *state, const char *dev_name, u8 port_num, u32 *out_sid); int security_netif_sid(struct selinux_state *state, char *name, u32 *if_sid); int security_node_sid(struct selinux_state *state, u16 domain, void *addr, u32 addrlen, u32 *out_sid); int security_validate_transition(struct selinux_state *state, u32 oldsid, u32 newsid, u32 tasksid, u16 tclass); int security_validate_transition_user(struct selinux_state *state, u32 oldsid, u32 newsid, u32 tasksid, u16 tclass); int security_bounded_transition(struct selinux_state *state, u32 oldsid, u32 newsid); int security_sid_mls_copy(struct selinux_state *state, u32 sid, u32 mls_sid, u32 *new_sid); int security_net_peersid_resolve(struct selinux_state *state, u32 nlbl_sid, u32 nlbl_type, u32 xfrm_sid, u32 *peer_sid); int security_get_classes(struct selinux_policy *policy, char ***classes, int *nclasses); int security_get_permissions(struct selinux_policy *policy, char *class, char ***perms, int *nperms); int security_get_reject_unknown(struct selinux_state *state); int security_get_allow_unknown(struct selinux_state *state); #define SECURITY_FS_USE_XATTR 1 /* use xattr */ #define SECURITY_FS_USE_TRANS 2 /* use transition SIDs, e.g. devpts/tmpfs */ #define SECURITY_FS_USE_TASK 3 /* use task SIDs, e.g. pipefs/sockfs */ #define SECURITY_FS_USE_GENFS 4 /* use the genfs support */ #define SECURITY_FS_USE_NONE 5 /* no labeling support */ #define SECURITY_FS_USE_MNTPOINT 6 /* use mountpoint labeling */ #define SECURITY_FS_USE_NATIVE 7 /* use native label support */ #define SECURITY_FS_USE_MAX 7 /* Highest SECURITY_FS_USE_XXX */ int security_fs_use(struct selinux_state *state, struct super_block *sb); int security_genfs_sid(struct selinux_state *state, const char *fstype, char *name, u16 sclass, u32 *sid); int selinux_policy_genfs_sid(struct selinux_policy *policy, const char *fstype, char *name, u16 sclass, u32 *sid); #ifdef CONFIG_NETLABEL int security_netlbl_secattr_to_sid(struct selinux_state *state, struct netlbl_lsm_secattr *secattr, u32 *sid); int security_netlbl_sid_to_secattr(struct selinux_state *state, u32 sid, struct netlbl_lsm_secattr *secattr); #else static inline int security_netlbl_secattr_to_sid(struct selinux_state *state, struct netlbl_lsm_secattr *secattr, u32 *sid) { return -EIDRM; } static inline int security_netlbl_sid_to_secattr(struct selinux_state *state, u32 sid, struct netlbl_lsm_secattr *secattr) { return -ENOENT; } #endif /* CONFIG_NETLABEL */ const char *security_get_initial_sid_context(u32 sid); /* * status notifier using mmap interface */ extern struct page *selinux_kernel_status_page(struct selinux_state *state); #define SELINUX_KERNEL_STATUS_VERSION 1 struct selinux_kernel_status { u32 version; /* version number of thie structure */ u32 sequence; /* sequence number of seqlock logic */ u32 enforcing; /* current setting of enforcing mode */ u32 policyload; /* times of policy reloaded */ u32 deny_unknown; /* current setting of deny_unknown */ /* * The version > 0 supports above members. */ } __packed; extern void selinux_status_update_setenforce(struct selinux_state *state, int enforcing); extern void selinux_status_update_policyload(struct selinux_state *state, int seqno); extern void selinux_complete_init(void); extern int selinux_disable(struct selinux_state *state); extern void exit_sel_fs(void); extern struct path selinux_null; extern struct vfsmount *selinuxfs_mount; extern void selnl_notify_setenforce(int val); extern void selnl_notify_policyload(u32 seqno); extern int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm); extern void avtab_cache_init(void); extern void ebitmap_cache_init(void); extern void hashtab_cache_init(void); extern int security_sidtab_hash_stats(struct selinux_state *state, char *page); #endif /* _SELINUX_SECURITY_H_ */
9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 #ifdef CONFIG_PREEMPTIRQ_TRACEPOINTS #undef TRACE_SYSTEM #define TRACE_SYSTEM preemptirq #if !defined(_TRACE_PREEMPTIRQ_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PREEMPTIRQ_H #include <linux/ktime.h> #include <linux/tracepoint.h> #include <linux/string.h> #include <asm/sections.h> DECLARE_EVENT_CLASS(preemptirq_template, TP_PROTO(unsigned long ip, unsigned long parent_ip), TP_ARGS(ip, parent_ip), TP_STRUCT__entry( __field(s32, caller_offs) __field(s32, parent_offs) ), TP_fast_assign( __entry->caller_offs = (s32)(ip - (unsigned long)_stext); __entry->parent_offs = (s32)(parent_ip - (unsigned long)_stext); ), TP_printk("caller=%pS parent=%pS", (void *)((unsigned long)(_stext) + __entry->caller_offs), (void *)((unsigned long)(_stext) + __entry->parent_offs)) ); #ifdef CONFIG_TRACE_IRQFLAGS DEFINE_EVENT(preemptirq_template, irq_disable, TP_PROTO(unsigned long ip, unsigned long parent_ip), TP_ARGS(ip, parent_ip)); DEFINE_EVENT(preemptirq_template, irq_enable, TP_PROTO(unsigned long ip, unsigned long parent_ip), TP_ARGS(ip, parent_ip)); #else #define trace_irq_enable(...) #define trace_irq_disable(...) #define trace_irq_enable_rcuidle(...) #define trace_irq_disable_rcuidle(...) #endif #ifdef CONFIG_TRACE_PREEMPT_TOGGLE DEFINE_EVENT(preemptirq_template, preempt_disable, TP_PROTO(unsigned long ip, unsigned long parent_ip), TP_ARGS(ip, parent_ip)); DEFINE_EVENT(preemptirq_template, preempt_enable, TP_PROTO(unsigned long ip, unsigned long parent_ip), TP_ARGS(ip, parent_ip)); #else #define trace_preempt_enable(...) #define trace_preempt_disable(...) #define trace_preempt_enable_rcuidle(...) #define trace_preempt_disable_rcuidle(...) #endif #endif /* _TRACE_PREEMPTIRQ_H */ #include <trace/define_trace.h> #else /* !CONFIG_PREEMPTIRQ_TRACEPOINTS */ #define trace_irq_enable(...) #define trace_irq_disable(...) #define trace_irq_enable_rcuidle(...) #define trace_irq_disable_rcuidle(...) #define trace_preempt_enable(...) #define trace_preempt_disable(...) #define trace_preempt_enable_rcuidle(...) #define trace_preempt_disable_rcuidle(...) #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_STRING_H_ #define _LINUX_STRING_H_ #include <linux/compiler.h> /* for inline */ #include <linux/types.h> /* for size_t */ #include <linux/stddef.h> /* for NULL */ #include <stdarg.h> #include <uapi/linux/string.h> extern char *strndup_user(const char __user *, long); extern void *memdup_user(const void __user *, size_t); extern void *vmemdup_user(const void __user *, size_t); extern void *memdup_user_nul(const void __user *, size_t); /* * Include machine specific inline routines */ #include <asm/string.h> #ifndef __HAVE_ARCH_STRCPY extern char * strcpy(char *,const char *); #endif #ifndef __HAVE_ARCH_STRNCPY extern char * strncpy(char *,const char *, __kernel_size_t); #endif #ifndef __HAVE_ARCH_STRLCPY size_t strlcpy(char *, const char *, size_t); #endif #ifndef __HAVE_ARCH_STRSCPY ssize_t strscpy(char *, const char *, size_t); #endif /* Wraps calls to strscpy()/memset(), no arch specific code required */ ssize_t strscpy_pad(char *dest, const char *src, size_t count); #ifndef __HAVE_ARCH_STRCAT extern char * strcat(char *, const char *); #endif #ifndef __HAVE_ARCH_STRNCAT extern char * strncat(char *, const char *, __kernel_size_t); #endif #ifndef __HAVE_ARCH_STRLCAT extern size_t strlcat(char *, const char *, __kernel_size_t); #endif #ifndef __HAVE_ARCH_STRCMP extern int strcmp(const char *,const char *); #endif #ifndef __HAVE_ARCH_STRNCMP extern int strncmp(const char *,const char *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_STRCASECMP extern int strcasecmp(const char *s1, const char *s2); #endif #ifndef __HAVE_ARCH_STRNCASECMP extern int strncasecmp(const char *s1, const char *s2, size_t n); #endif #ifndef __HAVE_ARCH_STRCHR extern char * strchr(const char *,int); #endif #ifndef __HAVE_ARCH_STRCHRNUL extern char * strchrnul(const char *,int); #endif extern char * strnchrnul(const char *, size_t, int); #ifndef __HAVE_ARCH_STRNCHR extern char * strnchr(const char *, size_t, int); #endif #ifndef __HAVE_ARCH_STRRCHR extern char * strrchr(const char *,int); #endif extern char * __must_check skip_spaces(const char *); extern char *strim(char *); static inline __must_check char *strstrip(char *str) { return strim(str); } #ifndef __HAVE_ARCH_STRSTR extern char * strstr(const char *, const char *); #endif #ifndef __HAVE_ARCH_STRNSTR extern char * strnstr(const char *, const char *, size_t); #endif #ifndef __HAVE_ARCH_STRLEN extern __kernel_size_t strlen(const char *); #endif #ifndef __HAVE_ARCH_STRNLEN extern __kernel_size_t strnlen(const char *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_STRPBRK extern char * strpbrk(const char *,const char *); #endif #ifndef __HAVE_ARCH_STRSEP extern char * strsep(char **,const char *); #endif #ifndef __HAVE_ARCH_STRSPN extern __kernel_size_t strspn(const char *,const char *); #endif #ifndef __HAVE_ARCH_STRCSPN extern __kernel_size_t strcspn(const char *,const char *); #endif #ifndef __HAVE_ARCH_MEMSET extern void * memset(void *,int,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMSET16 extern void *memset16(uint16_t *, uint16_t, __kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMSET32 extern void *memset32(uint32_t *, uint32_t, __kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMSET64 extern void *memset64(uint64_t *, uint64_t, __kernel_size_t); #endif static inline void *memset_l(unsigned long *p, unsigned long v, __kernel_size_t n) { if (BITS_PER_LONG == 32) return memset32((uint32_t *)p, v, n); else return memset64((uint64_t *)p, v, n); } static inline void *memset_p(void **p, void *v, __kernel_size_t n) { if (BITS_PER_LONG == 32) return memset32((uint32_t *)p, (uintptr_t)v, n); else return memset64((uint64_t *)p, (uintptr_t)v, n); } extern void **__memcat_p(void **a, void **b); #define memcat_p(a, b) ({ \ BUILD_BUG_ON_MSG(!__same_type(*(a), *(b)), \ "type mismatch in memcat_p()"); \ (typeof(*a) *)__memcat_p((void **)(a), (void **)(b)); \ }) #ifndef __HAVE_ARCH_MEMCPY extern void * memcpy(void *,const void *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMMOVE extern void * memmove(void *,const void *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMSCAN extern void * memscan(void *,int,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMCMP extern int memcmp(const void *,const void *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_BCMP extern int bcmp(const void *,const void *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMCHR extern void * memchr(const void *,int,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMCPY_FLUSHCACHE static inline void memcpy_flushcache(void *dst, const void *src, size_t cnt) { memcpy(dst, src, cnt); } #endif void *memchr_inv(const void *s, int c, size_t n); char *strreplace(char *s, char old, char new); extern void kfree_const(const void *x); extern char *kstrdup(const char *s, gfp_t gfp) __malloc; extern const char *kstrdup_const(const char *s, gfp_t gfp); extern char *kstrndup(const char *s, size_t len, gfp_t gfp); extern void *kmemdup(const void *src, size_t len, gfp_t gfp); extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp); extern char **argv_split(gfp_t gfp, const char *str, int *argcp); extern void argv_free(char **argv); extern bool sysfs_streq(const char *s1, const char *s2); int match_string(const char * const *array, size_t n, const char *string); int __sysfs_match_string(const char * const *array, size_t n, const char *s); /** * sysfs_match_string - matches given string in an array * @_a: array of strings * @_s: string to match with * * Helper for __sysfs_match_string(). Calculates the size of @a automatically. */ #define sysfs_match_string(_a, _s) __sysfs_match_string(_a, ARRAY_SIZE(_a), _s) #ifdef CONFIG_BINARY_PRINTF int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf); int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4); #endif extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, const void *from, size_t available); int ptr_to_hashval(const void *ptr, unsigned long *hashval_out); /** * strstarts - does @str start with @prefix? * @str: string to examine * @prefix: prefix to look for. */ static inline bool strstarts(const char *str, const char *prefix) { return strncmp(str, prefix, strlen(prefix)) == 0; } size_t memweight(const void *ptr, size_t bytes); /** * memzero_explicit - Fill a region of memory (e.g. sensitive * keying data) with 0s. * @s: Pointer to the start of the area. * @count: The size of the area. * * Note: usually using memset() is just fine (!), but in cases * where clearing out _local_ data at the end of a scope is * necessary, memzero_explicit() should be used instead in * order to prevent the compiler from optimising away zeroing. * * memzero_explicit() doesn't need an arch-specific version as * it just invokes the one of memset() implicitly. */ static inline void memzero_explicit(void *s, size_t count) { memset(s, 0, count); barrier_data(s); } /** * kbasename - return the last part of a pathname. * * @path: path to extract the filename from. */ static inline const char *kbasename(const char *path) { const char *tail = strrchr(path, '/'); return tail ? tail + 1 : path; } #define __FORTIFY_INLINE extern __always_inline __attribute__((gnu_inline)) #define __RENAME(x) __asm__(#x) void fortify_panic(const char *name) __noreturn __cold; void __read_overflow(void) __compiletime_error("detected read beyond size of object passed as 1st parameter"); void __read_overflow2(void) __compiletime_error("detected read beyond size of object passed as 2nd parameter"); void __read_overflow3(void) __compiletime_error("detected read beyond size of object passed as 3rd parameter"); void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter"); #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) #ifdef CONFIG_KASAN extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr); extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp); extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy); extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove); extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset); extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat); extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy); extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen); extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat); extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy); #else #define __underlying_memchr __builtin_memchr #define __underlying_memcmp __builtin_memcmp #define __underlying_memcpy __builtin_memcpy #define __underlying_memmove __builtin_memmove #define __underlying_memset __builtin_memset #define __underlying_strcat __builtin_strcat #define __underlying_strcpy __builtin_strcpy #define __underlying_strlen __builtin_strlen #define __underlying_strncat __builtin_strncat #define __underlying_strncpy __builtin_strncpy #endif __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) { size_t p_size = __builtin_object_size(p, 0); if (__builtin_constant_p(size) && p_size < size) __write_overflow(); if (p_size < size) fortify_panic(__func__); return __underlying_strncpy(p, q, size); } __FORTIFY_INLINE char *strcat(char *p, const char *q) { size_t p_size = __builtin_object_size(p, 0); if (p_size == (size_t)-1) return __underlying_strcat(p, q); if (strlcat(p, q, p_size) >= p_size) fortify_panic(__func__); return p; } __FORTIFY_INLINE __kernel_size_t strlen(const char *p) { __kernel_size_t ret; size_t p_size = __builtin_object_size(p, 0); /* Work around gcc excess stack consumption issue */ if (p_size == (size_t)-1 || (__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0')) return __underlying_strlen(p); ret = strnlen(p, p_size); if (p_size <= ret) fortify_panic(__func__); return ret; } extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); __FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen) { size_t p_size = __builtin_object_size(p, 0); __kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); if (p_size <= ret && maxlen != ret) fortify_panic(__func__); return ret; } /* defined after fortified strlen to reuse it */ extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) { size_t ret; size_t p_size = __builtin_object_size(p, 0); size_t q_size = __builtin_object_size(q, 0); if (p_size == (size_t)-1 && q_size == (size_t)-1) return __real_strlcpy(p, q, size); ret = strlen(q); if (size) { size_t len = (ret >= size) ? size - 1 : ret; if (__builtin_constant_p(len) && len >= p_size) __write_overflow(); if (len >= p_size) fortify_panic(__func__); __underlying_memcpy(p, q, len); p[len] = '\0'; } return ret; } /* defined after fortified strlen and strnlen to reuse them */ __FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count) { size_t p_len, copy_len; size_t p_size = __builtin_object_size(p, 0); size_t q_size = __builtin_object_size(q, 0); if (p_size == (size_t)-1 && q_size == (size_t)-1) return __underlying_strncat(p, q, count); p_len = strlen(p); copy_len = strnlen(q, count); if (p_size < p_len + copy_len + 1) fortify_panic(__func__); __underlying_memcpy(p + p_len, q, copy_len); p[p_len + copy_len] = '\0'; return p; } __FORTIFY_INLINE void *memset(void *p, int c, __kernel_size_t size) { size_t p_size = __builtin_object_size(p, 0); if (__builtin_constant_p(size) && p_size < size) __write_overflow(); if (p_size < size) fortify_panic(__func__); return __underlying_memset(p, c, size); } __FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size) { size_t p_size = __builtin_object_size(p, 0); size_t q_size = __builtin_object_size(q, 0); if (__builtin_constant_p(size)) { if (p_size < size) __write_overflow(); if (q_size < size) __read_overflow2(); } if (p_size < size || q_size < size) fortify_panic(__func__); return __underlying_memcpy(p, q, size); } __FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size) { size_t p_size = __builtin_object_size(p, 0); size_t q_size = __builtin_object_size(q, 0); if (__builtin_constant_p(size)) { if (p_size < size) __write_overflow(); if (q_size < size) __read_overflow2(); } if (p_size < size || q_size < size) fortify_panic(__func__); return __underlying_memmove(p, q, size); } extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan); __FORTIFY_INLINE void *memscan(void *p, int c, __kernel_size_t size) { size_t p_size = __builtin_object_size(p, 0); if (__builtin_constant_p(size) && p_size < size) __read_overflow(); if (p_size < size) fortify_panic(__func__); return __real_memscan(p, c, size); } __FORTIFY_INLINE int memcmp(const void *p, const void *q, __kernel_size_t size) { size_t p_size = __builtin_object_size(p, 0); size_t q_size = __builtin_object_size(q, 0); if (__builtin_constant_p(size)) { if (p_size < size) __read_overflow(); if (q_size < size) __read_overflow2(); } if (p_size < size || q_size < size) fortify_panic(__func__); return __underlying_memcmp(p, q, size); } __FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size) { size_t p_size = __builtin_object_size(p, 0); if (__builtin_constant_p(size) && p_size < size) __read_overflow(); if (p_size < size) fortify_panic(__func__); return __underlying_memchr(p, c, size); } void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv); __FORTIFY_INLINE void *memchr_inv(const void *p, int c, size_t size) { size_t p_size = __builtin_object_size(p, 0); if (__builtin_constant_p(size) && p_size < size) __read_overflow(); if (p_size < size) fortify_panic(__func__); return __real_memchr_inv(p, c, size); } extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup); __FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp) { size_t p_size = __builtin_object_size(p, 0); if (__builtin_constant_p(size) && p_size < size) __read_overflow(); if (p_size < size) fortify_panic(__func__); return __real_kmemdup(p, size, gfp); } /* defined after fortified strlen and memcpy to reuse them */ __FORTIFY_INLINE char *strcpy(char *p, const char *q) { size_t p_size = __builtin_object_size(p, 0); size_t q_size = __builtin_object_size(q, 0); if (p_size == (size_t)-1 && q_size == (size_t)-1) return __underlying_strcpy(p, q); memcpy(p, q, strlen(q) + 1); return p; } /* Don't use these outside the FORITFY_SOURCE implementation */ #undef __underlying_memchr #undef __underlying_memcmp #undef __underlying_memcpy #undef __underlying_memmove #undef __underlying_memset #undef __underlying_strcat #undef __underlying_strcpy #undef __underlying_strlen #undef __underlying_strncat #undef __underlying_strncpy #endif /** * memcpy_and_pad - Copy one buffer to another with padding * @dest: Where to copy to * @dest_len: The destination buffer size * @src: Where to copy from * @count: The number of bytes to copy * @pad: Character to use for padding if space is left in destination. */ static inline void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, int pad) { if (dest_len > count) { memcpy(dest, src, count); memset(dest + count, pad, dest_len - count); } else memcpy(dest, src, dest_len); } /** * str_has_prefix - Test if a string has a given prefix * @str: The string to test * @prefix: The string to see if @str starts with * * A common way to test a prefix of a string is to do: * strncmp(str, prefix, sizeof(prefix) - 1) * * But this can lead to bugs due to typos, or if prefix is a pointer * and not a constant. Instead use str_has_prefix(). * * Returns: * * strlen(@prefix) if @str starts with @prefix * * 0 if @str does not start with @prefix */ static __always_inline size_t str_has_prefix(const char *str, const char *prefix) { size_t len = strlen(prefix); return strncmp(str, prefix, len) == 0 ? len : 0; } #endif /* _LINUX_STRING_H_ */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PTRACE_H #define _ASM_X86_PTRACE_H #include <asm/segment.h> #include <asm/page_types.h> #include <uapi/asm/ptrace.h> #ifndef __ASSEMBLY__ #ifdef __i386__ struct pt_regs { /* * NB: 32-bit x86 CPUs are inconsistent as what happens in the * following cases (where %seg represents a segment register): * * - pushl %seg: some do a 16-bit write and leave the high * bits alone * - movl %seg, [mem]: some do a 16-bit write despite the movl * - IDT entry: some (e.g. 486) will leave the high bits of CS * and (if applicable) SS undefined. * * Fortunately, x86-32 doesn't read the high bits on POP or IRET, * so we can just treat all of the segment registers as 16-bit * values. */ unsigned long bx; unsigned long cx; unsigned long dx; unsigned long si; unsigned long di; unsigned long bp; unsigned long ax; unsigned short ds; unsigned short __dsh; unsigned short es; unsigned short __esh; unsigned short fs; unsigned short __fsh; /* * On interrupt, gs and __gsh store the vector number. They never * store gs any more. */ unsigned short gs; unsigned short __gsh; /* On interrupt, this is the error code. */ unsigned long orig_ax; unsigned long ip; unsigned short cs; unsigned short __csh; unsigned long flags; unsigned long sp; unsigned short ss; unsigned short __ssh; }; #else /* __i386__ */ struct pt_regs { /* * C ABI says these regs are callee-preserved. They aren't saved on kernel entry * unless syscall needs a complete, fully filled "struct pt_regs". */ unsigned long r15; unsigned long r14; unsigned long r13; unsigned long r12; unsigned long bp; unsigned long bx; /* These regs are callee-clobbered. Always saved on kernel entry. */ unsigned long r11; unsigned long r10; unsigned long r9; unsigned long r8; unsigned long ax; unsigned long cx; unsigned long dx; unsigned long si; unsigned long di; /* * On syscall entry, this is syscall#. On CPU exception, this is error code. * On hw interrupt, it's IRQ number: */ unsigned long orig_ax; /* Return frame for iretq */ unsigned long ip; unsigned long cs; unsigned long flags; unsigned long sp; unsigned long ss; /* top of stack page */ }; #endif /* !__i386__ */ #ifdef CONFIG_PARAVIRT #include <asm/paravirt_types.h> #endif #include <asm/proto.h> struct cpuinfo_x86; struct task_struct; extern unsigned long profile_pc(struct pt_regs *regs); extern unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); extern void send_sigtrap(struct pt_regs *regs, int error_code, int si_code); static inline unsigned long regs_return_value(struct pt_regs *regs) { return regs->ax; } static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) { regs->ax = rc; } /* * user_mode(regs) determines whether a register set came from user * mode. On x86_32, this is true if V8086 mode was enabled OR if the * register set was from protected mode with RPL-3 CS value. This * tricky test checks that with one comparison. * * On x86_64, vm86 mode is mercifully nonexistent, and we don't need * the extra check. */ static __always_inline int user_mode(struct pt_regs *regs) { #ifdef CONFIG_X86_32 return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= USER_RPL; #else return !!(regs->cs & 3); #endif } static inline int v8086_mode(struct pt_regs *regs) { #ifdef CONFIG_X86_32 return (regs->flags & X86_VM_MASK); #else return 0; /* No V86 mode support in long mode */ #endif } static inline bool user_64bit_mode(struct pt_regs *regs) { #ifdef CONFIG_X86_64 #ifndef CONFIG_PARAVIRT_XXL /* * On non-paravirt systems, this is the only long mode CPL 3 * selector. We do not allow long mode selectors in the LDT. */ return regs->cs == __USER_CS; #else /* Headers are too twisted for this to go in paravirt.h. */ return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs; #endif #else /* !CONFIG_X86_64 */ return false; #endif } /* * Determine whether the register set came from any context that is running in * 64-bit mode. */ static inline bool any_64bit_mode(struct pt_regs *regs) { #ifdef CONFIG_X86_64 return !user_mode(regs) || user_64bit_mode(regs); #else return false; #endif } #ifdef CONFIG_X86_64 #define current_user_stack_pointer() current_pt_regs()->sp #define compat_user_stack_pointer() current_pt_regs()->sp static inline bool ip_within_syscall_gap(struct pt_regs *regs) { bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 && regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack); #ifdef CONFIG_IA32_EMULATION ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat && regs->ip < (unsigned long)entry_SYSCALL_compat_safe_stack); #endif return ret; } #endif static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) { return regs->sp; } static inline unsigned long instruction_pointer(struct pt_regs *regs) { return regs->ip; } static inline void instruction_pointer_set(struct pt_regs *regs, unsigned long val) { regs->ip = val; } static inline unsigned long frame_pointer(struct pt_regs *regs) { return regs->bp; } static inline unsigned long user_stack_pointer(struct pt_regs *regs) { return regs->sp; } static inline void user_stack_pointer_set(struct pt_regs *regs, unsigned long val) { regs->sp = val; } static __always_inline bool regs_irqs_disabled(struct pt_regs *regs) { return !(regs->flags & X86_EFLAGS_IF); } /* Query offset/name of register from its name/offset */ extern int regs_query_register_offset(const char *name); extern const char *regs_query_register_name(unsigned int offset); #define MAX_REG_OFFSET (offsetof(struct pt_regs, ss)) /** * regs_get_register() - get register value from its offset * @regs: pt_regs from which register value is gotten. * @offset: offset number of the register. * * regs_get_register returns the value of a register. The @offset is the * offset of the register in struct pt_regs address which specified by @regs. * If @offset is bigger than MAX_REG_OFFSET, this returns 0. */ static inline unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset) { if (unlikely(offset > MAX_REG_OFFSET)) return 0; #ifdef CONFIG_X86_32 /* The selector fields are 16-bit. */ if (offset == offsetof(struct pt_regs, cs) || offset == offsetof(struct pt_regs, ss) || offset == offsetof(struct pt_regs, ds) || offset == offsetof(struct pt_regs, es) || offset == offsetof(struct pt_regs, fs) || offset == offsetof(struct pt_regs, gs)) { return *(u16 *)((unsigned long)regs + offset); } #endif return *(unsigned long *)((unsigned long)regs + offset); } /** * regs_within_kernel_stack() - check the address in the stack * @regs: pt_regs which contains kernel stack pointer. * @addr: address which is checked. * * regs_within_kernel_stack() checks @addr is within the kernel stack page(s). * If @addr is within the kernel stack, it returns true. If not, returns false. */ static inline int regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) { return ((addr & ~(THREAD_SIZE - 1)) == (regs->sp & ~(THREAD_SIZE - 1))); } /** * regs_get_kernel_stack_nth_addr() - get the address of the Nth entry on stack * @regs: pt_regs which contains kernel stack pointer. * @n: stack entry number. * * regs_get_kernel_stack_nth() returns the address of the @n th entry of the * kernel stack which is specified by @regs. If the @n th entry is NOT in * the kernel stack, this returns NULL. */ static inline unsigned long *regs_get_kernel_stack_nth_addr(struct pt_regs *regs, unsigned int n) { unsigned long *addr = (unsigned long *)regs->sp; addr += n; if (regs_within_kernel_stack(regs, (unsigned long)addr)) return addr; else return NULL; } /* To avoid include hell, we can't include uaccess.h */ extern long copy_from_kernel_nofault(void *dst, const void *src, size_t size); /** * regs_get_kernel_stack_nth() - get Nth entry of the stack * @regs: pt_regs which contains kernel stack pointer. * @n: stack entry number. * * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which * is specified by @regs. If the @n th entry is NOT in the kernel stack * this returns 0. */ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) { unsigned long *addr; unsigned long val; long ret; addr = regs_get_kernel_stack_nth_addr(regs, n); if (addr) { ret = copy_from_kernel_nofault(&val, addr, sizeof(val)); if (!ret) return val; } return 0; } /** * regs_get_kernel_argument() - get Nth function argument in kernel * @regs: pt_regs of that context * @n: function argument number (start from 0) * * regs_get_argument() returns @n th argument of the function call. * Note that this chooses most probably assignment, in some case * it can be incorrect. * This is expected to be called from kprobes or ftrace with regs * where the top of stack is the return address. */ static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, unsigned int n) { static const unsigned int argument_offs[] = { #ifdef __i386__ offsetof(struct pt_regs, ax), offsetof(struct pt_regs, dx), offsetof(struct pt_regs, cx), #define NR_REG_ARGUMENTS 3 #else offsetof(struct pt_regs, di), offsetof(struct pt_regs, si), offsetof(struct pt_regs, dx), offsetof(struct pt_regs, cx), offsetof(struct pt_regs, r8), offsetof(struct pt_regs, r9), #define NR_REG_ARGUMENTS 6 #endif }; if (n >= NR_REG_ARGUMENTS) { n -= NR_REG_ARGUMENTS - 1; return regs_get_kernel_stack_nth(regs, n); } else return regs_get_register(regs, argument_offs[n]); } #define arch_has_single_step() (1) #ifdef CONFIG_X86_DEBUGCTLMSR #define arch_has_block_step() (1) #else #define arch_has_block_step() (boot_cpu_data.x86 >= 6) #endif #define ARCH_HAS_USER_SINGLE_STEP_REPORT struct user_desc; extern int do_get_thread_area(struct task_struct *p, int idx, struct user_desc __user *info); extern int do_set_thread_area(struct task_struct *p, int idx, struct user_desc __user *info, int can_allocate); #ifdef CONFIG_X86_64 # define do_set_thread_area_64(p, s, t) do_arch_prctl_64(p, s, t) #else # define do_set_thread_area_64(p, s, t) (0) #endif #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PTRACE_H */
8 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 /* SPDX-License-Identifier: GPL-2.0-only */ /* * NSA Security-Enhanced Linux (SELinux) security module * * This file contains the SELinux security data structures for kernel objects. * * Author(s): Stephen Smalley, <sds@tycho.nsa.gov> * Chris Vance, <cvance@nai.com> * Wayne Salamon, <wsalamon@nai.com> * James Morris <jmorris@redhat.com> * * Copyright (C) 2001,2002 Networks Associates Technology, Inc. * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> * Copyright (C) 2016 Mellanox Technologies */ #ifndef _SELINUX_OBJSEC_H_ #define _SELINUX_OBJSEC_H_ #include <linux/list.h> #include <linux/sched.h> #include <linux/fs.h> #include <linux/binfmts.h> #include <linux/in.h> #include <linux/spinlock.h> #include <linux/lsm_hooks.h> #include <linux/msg.h> #include <net/net_namespace.h> #include "flask.h" #include "avc.h" struct task_security_struct { u32 osid; /* SID prior to last execve */ u32 sid; /* current SID */ u32 exec_sid; /* exec SID */ u32 create_sid; /* fscreate SID */ u32 keycreate_sid; /* keycreate SID */ u32 sockcreate_sid; /* fscreate SID */ } __randomize_layout; enum label_initialized { LABEL_INVALID, /* invalid or not initialized */ LABEL_INITIALIZED, /* initialized */ LABEL_PENDING }; struct inode_security_struct { struct inode *inode; /* back pointer to inode object */ struct list_head list; /* list of inode_security_struct */ u32 task_sid; /* SID of creating task */ u32 sid; /* SID of this object */ u16 sclass; /* security class of this object */ unsigned char initialized; /* initialization flag */ spinlock_t lock; }; struct file_security_struct { u32 sid; /* SID of open file description */ u32 fown_sid; /* SID of file owner (for SIGIO) */ u32 isid; /* SID of inode at the time of file open */ u32 pseqno; /* Policy seqno at the time of file open */ }; struct superblock_security_struct { struct super_block *sb; /* back pointer to sb object */ u32 sid; /* SID of file system superblock */ u32 def_sid; /* default SID for labeling */ u32 mntpoint_sid; /* SECURITY_FS_USE_MNTPOINT context for files */ unsigned short behavior; /* labeling behavior */ unsigned short flags; /* which mount options were specified */ struct mutex lock; struct list_head isec_head; spinlock_t isec_lock; }; struct msg_security_struct { u32 sid; /* SID of message */ }; struct ipc_security_struct { u16 sclass; /* security class of this object */ u32 sid; /* SID of IPC resource */ }; struct netif_security_struct { struct net *ns; /* network namespace */ int ifindex; /* device index */ u32 sid; /* SID for this interface */ }; struct netnode_security_struct { union { __be32 ipv4; /* IPv4 node address */ struct in6_addr ipv6; /* IPv6 node address */ } addr; u32 sid; /* SID for this node */ u16 family; /* address family */ }; struct netport_security_struct { u32 sid; /* SID for this node */ u16 port; /* port number */ u8 protocol; /* transport protocol */ }; struct sk_security_struct { #ifdef CONFIG_NETLABEL enum { /* NetLabel state */ NLBL_UNSET = 0, NLBL_REQUIRE, NLBL_LABELED, NLBL_REQSKB, NLBL_CONNLABELED, } nlbl_state; struct netlbl_lsm_secattr *nlbl_secattr; /* NetLabel sec attributes */ #endif u32 sid; /* SID of this object */ u32 peer_sid; /* SID of peer */ u16 sclass; /* sock security class */ enum { /* SCTP association state */ SCTP_ASSOC_UNSET = 0, SCTP_ASSOC_SET, } sctp_assoc_state; }; struct tun_security_struct { u32 sid; /* SID for the tun device sockets */ }; struct key_security_struct { u32 sid; /* SID of key */ }; struct ib_security_struct { u32 sid; /* SID of the queue pair or MAD agent */ }; struct pkey_security_struct { u64 subnet_prefix; /* Port subnet prefix */ u16 pkey; /* PKey number */ u32 sid; /* SID of pkey */ }; struct bpf_security_struct { u32 sid; /* SID of bpf obj creator */ }; struct perf_event_security_struct { u32 sid; /* SID of perf_event obj creator */ }; extern struct lsm_blob_sizes selinux_blob_sizes; static inline struct task_security_struct *selinux_cred(const struct cred *cred) { return cred->security + selinux_blob_sizes.lbs_cred; } static inline struct file_security_struct *selinux_file(const struct file *file) { return file->f_security + selinux_blob_sizes.lbs_file; } static inline struct inode_security_struct *selinux_inode( const struct inode *inode) { if (unlikely(!inode->i_security)) return NULL; return inode->i_security + selinux_blob_sizes.lbs_inode; } static inline struct msg_security_struct *selinux_msg_msg( const struct msg_msg *msg_msg) { return msg_msg->security + selinux_blob_sizes.lbs_msg_msg; } static inline struct ipc_security_struct *selinux_ipc( const struct kern_ipc_perm *ipc) { return ipc->security + selinux_blob_sizes.lbs_ipc; } /* * get the subjective security ID of the current task */ static inline u32 current_sid(void) { const struct task_security_struct *tsec = selinux_cred(current_cred()); return tsec->sid; } #endif /* _SELINUX_OBJSEC_H_ */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1994 Linus Torvalds * * Pentium III FXSR, SSE support * General FPU state handling cleanups * Gareth Hughes <gareth@valinux.com>, May 2000 */ #include <asm/fpu/internal.h> #include <asm/fpu/regset.h> #include <asm/fpu/signal.h> #include <asm/fpu/types.h> #include <asm/traps.h> #include <asm/irq_regs.h> #include <linux/hardirq.h> #include <linux/pkeys.h> #define CREATE_TRACE_POINTS #include <asm/trace/fpu.h> /* * Represents the initial FPU state. It's mostly (but not completely) zeroes, * depending on the FPU hardware format: */ union fpregs_state init_fpstate __read_mostly; /* Track in-kernel FPU usage */ static DEFINE_PER_CPU(bool, in_kernel_fpu); /* * Track which context is using the FPU on the CPU: */ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); /* * Can we use the FPU in kernel mode with the * whole "kernel_fpu_begin/end()" sequence? */ bool irq_fpu_usable(void) { if (WARN_ON_ONCE(in_nmi())) return false; /* In kernel FPU usage already active? */ if (this_cpu_read(in_kernel_fpu)) return false; /* * When not in NMI or hard interrupt context, FPU can be used in: * * - Task context except from within fpregs_lock()'ed critical * regions. * * - Soft interrupt processing context which cannot happen * while in a fpregs_lock()'ed critical region. */ if (!in_irq()) return true; /* * In hard interrupt context it's safe when soft interrupts * are enabled, which means the interrupt did not hit in * a fpregs_lock()'ed critical region. */ return !softirq_count(); } EXPORT_SYMBOL(irq_fpu_usable); /* * These must be called with preempt disabled. Returns * 'true' if the FPU state is still intact and we can * keep registers active. * * The legacy FNSAVE instruction cleared all FPU state * unconditionally, so registers are essentially destroyed. * Modern FPU state can be kept in registers, if there are * no pending FP exceptions. */ int copy_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { copy_xregs_to_kernel(&fpu->state.xsave); /* * AVX512 state is tracked here because its use is * known to slow the max clock speed of the core. */ if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512) fpu->avx512_timestamp = jiffies; return 1; } if (likely(use_fxsr())) { copy_fxregs_to_kernel(fpu); return 1; } /* * Legacy FPU register saving, FNSAVE always clears FPU registers, * so we have to mark them inactive: */ asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); return 0; } EXPORT_SYMBOL(copy_fpregs_to_fpstate); void kernel_fpu_begin_mask(unsigned int kfpu_mask) { preempt_disable(); WARN_ON_FPU(!irq_fpu_usable()); WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); this_cpu_write(in_kernel_fpu, true); if (!(current->flags & PF_KTHREAD) && !test_thread_flag(TIF_NEED_FPU_LOAD)) { set_thread_flag(TIF_NEED_FPU_LOAD); /* * Ignore return value -- we don't care if reg state * is clobbered. */ copy_fpregs_to_fpstate(&current->thread.fpu); } __cpu_invalidate_fpregs_state(); /* Put sane initial values into the control registers. */ if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM)) ldmxcsr(MXCSR_DEFAULT); if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU)) asm volatile ("fninit"); } EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask); void kernel_fpu_end(void) { WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); this_cpu_write(in_kernel_fpu, false); preempt_enable(); } EXPORT_SYMBOL_GPL(kernel_fpu_end); /* * Save the FPU state (mark it for reload if necessary): * * This only ever gets called for the current task. */ void fpu__save(struct fpu *fpu) { WARN_ON_FPU(fpu != &current->thread.fpu); fpregs_lock(); trace_x86_fpu_before_save(fpu); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { if (!copy_fpregs_to_fpstate(fpu)) { copy_kernel_to_fpregs(&fpu->state); } } trace_x86_fpu_after_save(fpu); fpregs_unlock(); } /* * Legacy x87 fpstate state init: */ static inline void fpstate_init_fstate(struct fregs_state *fp) { fp->cwd = 0xffff037fu; fp->swd = 0xffff0000u; fp->twd = 0xffffffffu; fp->fos = 0xffff0000u; } void fpstate_init(union fpregs_state *state) { if (!static_cpu_has(X86_FEATURE_FPU)) { fpstate_init_soft(&state->soft); return; } memset(state, 0, fpu_kernel_xstate_size); if (static_cpu_has(X86_FEATURE_XSAVES)) fpstate_init_xstate(&state->xsave); if (static_cpu_has(X86_FEATURE_FXSR)) fpstate_init_fxstate(&state->fxsave); else fpstate_init_fstate(&state->fsave); } EXPORT_SYMBOL_GPL(fpstate_init); int fpu__copy(struct task_struct *dst, struct task_struct *src) { struct fpu *dst_fpu = &dst->thread.fpu; struct fpu *src_fpu = &src->thread.fpu; dst_fpu->last_cpu = -1; if (!static_cpu_has(X86_FEATURE_FPU)) return 0; WARN_ON_FPU(src_fpu != &current->thread.fpu); /* * Don't let 'init optimized' areas of the XSAVE area * leak into the child task: */ memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); /* * If the FPU registers are not current just memcpy() the state. * Otherwise save current FPU registers directly into the child's FPU * context, without any memory-to-memory copying. * * ( The function 'fails' in the FNSAVE case, which destroys * register contents so we have to load them back. ) */ fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size); else if (!copy_fpregs_to_fpstate(dst_fpu)) copy_kernel_to_fpregs(&dst_fpu->state); fpregs_unlock(); set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); return 0; } /* * Activate the current task's in-memory FPU context, * if it has not been used before: */ static void fpu__initialize(struct fpu *fpu) { WARN_ON_FPU(fpu != &current->thread.fpu); set_thread_flag(TIF_NEED_FPU_LOAD); fpstate_init(&fpu->state); trace_x86_fpu_init_state(fpu); } /* * This function must be called before we read a task's fpstate. * * There's two cases where this gets called: * * - for the current task (when coredumping), in which case we have * to save the latest FPU registers into the fpstate, * * - or it's called for stopped tasks (ptrace), in which case the * registers were already saved by the context-switch code when * the task scheduled out. * * If the task has used the FPU before then save it. */ void fpu__prepare_read(struct fpu *fpu) { if (fpu == &current->thread.fpu) fpu__save(fpu); } /* * This function must be called before we write a task's fpstate. * * Invalidate any cached FPU registers. * * After this function call, after registers in the fpstate are * modified and the child task has woken up, the child task will * restore the modified FPU state from the modified context. If we * didn't clear its cached status here then the cached in-registers * state pending on its former CPU could be restored, corrupting * the modifications. */ void fpu__prepare_write(struct fpu *fpu) { /* * Only stopped child tasks can be used to modify the FPU * state in the fpstate buffer: */ WARN_ON_FPU(fpu == &current->thread.fpu); /* Invalidate any cached state: */ __fpu_invalidate_fpregs_state(fpu); } /* * Drops current FPU state: deactivates the fpregs and * the fpstate. NOTE: it still leaves previous contents * in the fpregs in the eager-FPU case. * * This function can be used in cases where we know that * a state-restore is coming: either an explicit one, * or a reschedule. */ void fpu__drop(struct fpu *fpu) { preempt_disable(); if (fpu == &current->thread.fpu) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" _ASM_EXTABLE(1b, 2b)); fpregs_deactivate(fpu); } trace_x86_fpu_dropped(fpu); preempt_enable(); } /* * Clear FPU registers by setting them up from the init fpstate. * Caller must do fpregs_[un]lock() around it. */ static inline void copy_init_fpstate_to_fpregs(u64 features_mask) { if (use_xsave()) copy_kernel_to_xregs(&init_fpstate.xsave, features_mask); else if (static_cpu_has(X86_FEATURE_FXSR)) copy_kernel_to_fxregs(&init_fpstate.fxsave); else copy_kernel_to_fregs(&init_fpstate.fsave); if (boot_cpu_has(X86_FEATURE_OSPKE)) copy_init_pkru_to_fpregs(); } /* * Clear the FPU state back to init state. * * Called by sys_execve(), by the signal handler code and by various * error paths. */ static void fpu__clear(struct fpu *fpu, bool user_only) { WARN_ON_FPU(fpu != &current->thread.fpu); if (!static_cpu_has(X86_FEATURE_FPU)) { fpu__drop(fpu); fpu__initialize(fpu); return; } fpregs_lock(); if (user_only) { if (!fpregs_state_valid(fpu, smp_processor_id()) && xfeatures_mask_supervisor()) copy_kernel_to_xregs(&fpu->state.xsave, xfeatures_mask_supervisor()); copy_init_fpstate_to_fpregs(xfeatures_mask_user()); } else { copy_init_fpstate_to_fpregs(xfeatures_mask_all); } fpregs_mark_activate(); fpregs_unlock(); } void fpu__clear_user_states(struct fpu *fpu) { fpu__clear(fpu, true); } void fpu__clear_all(struct fpu *fpu) { fpu__clear(fpu, false); } /* * Load FPU context before returning to userspace. */ void switch_fpu_return(void) { if (!static_cpu_has(X86_FEATURE_FPU)) return; __fpregs_load_activate(); } EXPORT_SYMBOL_GPL(switch_fpu_return); #ifdef CONFIG_X86_DEBUG_FPU /* * If current FPU state according to its tracking (loaded FPU context on this * CPU) is not valid then we must have TIF_NEED_FPU_LOAD set so the context is * loaded on return to userland. */ void fpregs_assert_state_consistent(void) { struct fpu *fpu = &current->thread.fpu; if (test_thread_flag(TIF_NEED_FPU_LOAD)) return; WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id())); } EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent); #endif void fpregs_mark_activate(void) { struct fpu *fpu = &current->thread.fpu; fpregs_activate(fpu); fpu->last_cpu = smp_processor_id(); clear_thread_flag(TIF_NEED_FPU_LOAD); } EXPORT_SYMBOL_GPL(fpregs_mark_activate); /* * x87 math exception handling: */ int fpu__exception_code(struct fpu *fpu, int trap_nr) { int err; if (trap_nr == X86_TRAP_MF) { unsigned short cwd, swd; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked * status. 0x3f is the exception bits in these regs, 0x200 is the * C1 reg you need in case of a stack fault, 0x040 is the stack * fault bit. We should only be taking one exception at a time, * so if this combination doesn't produce any single exception, * then we have a bad program that isn't synchronizing its FPU usage * and it will suffer the consequences since we won't be able to * fully reproduce the context of the exception. */ if (boot_cpu_has(X86_FEATURE_FXSR)) { cwd = fpu->state.fxsave.cwd; swd = fpu->state.fxsave.swd; } else { cwd = (unsigned short)fpu->state.fsave.cwd; swd = (unsigned short)fpu->state.fsave.swd; } err = swd & ~cwd; } else { /* * The SIMD FPU exceptions are handled a little differently, as there * is only a single status/control register. Thus, to determine which * unmasked exception was caught we must mask the exception mask bits * at 0x1f80, and then use these to mask the exception bits at 0x3f. */ unsigned short mxcsr = MXCSR_DEFAULT; if (boot_cpu_has(X86_FEATURE_XMM)) mxcsr = fpu->state.fxsave.mxcsr; err = ~(mxcsr >> 7) & mxcsr; } if (err & 0x001) { /* Invalid op */ /* * swd & 0x240 == 0x040: Stack Underflow * swd & 0x240 == 0x240: Stack Overflow * User must clear the SF bit (0x40) if set */ return FPE_FLTINV; } else if (err & 0x004) { /* Divide by Zero */ return FPE_FLTDIV; } else if (err & 0x008) { /* Overflow */ return FPE_FLTOVF; } else if (err & 0x012) { /* Denormal, Underflow */ return FPE_FLTUND; } else if (err & 0x020) { /* Precision */ return FPE_FLTRES; } /* * If we're using IRQ 13, or supposedly even some trap * X86_TRAP_MF implementations, it's possible * we get a spurious trap, which is not an error. */ return 0; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef LLIST_H #define LLIST_H /* * Lock-less NULL terminated single linked list * * Cases where locking is not needed: * If there are multiple producers and multiple consumers, llist_add can be * used in producers and llist_del_all can be used in consumers simultaneously * without locking. Also a single consumer can use llist_del_first while * multiple producers simultaneously use llist_add, without any locking. * * Cases where locking is needed: * If we have multiple consumers with llist_del_first used in one consumer, and * llist_del_first or llist_del_all used in other consumers, then a lock is * needed. This is because llist_del_first depends on list->first->next not * changing, but without lock protection, there's no way to be sure about that * if a preemption happens in the middle of the delete operation and on being * preempted back, the list->first is the same as before causing the cmpxchg in * llist_del_first to succeed. For example, while a llist_del_first operation * is in progress in one consumer, then a llist_del_first, llist_add, * llist_add (or llist_del_all, llist_add, llist_add) sequence in another * consumer may cause violations. * * This can be summarized as follows: * * | add | del_first | del_all * add | - | - | - * del_first | | L | L * del_all | | | - * * Where, a particular row's operation can happen concurrently with a column's * operation, with "-" being no lock needed, while "L" being lock is needed. * * The list entries deleted via llist_del_all can be traversed with * traversing function such as llist_for_each etc. But the list * entries can not be traversed safely before deleted from the list. * The order of deleted entries is from the newest to the oldest added * one. If you want to traverse from the oldest to the newest, you * must reverse the order by yourself before traversing. * * The basic atomic operation of this list is cmpxchg on long. On * architectures that don't have NMI-safe cmpxchg implementation, the * list can NOT be used in NMI handlers. So code that uses the list in * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. * * Copyright 2010,2011 Intel Corp. * Author: Huang Ying <ying.huang@intel.com> */ #include <linux/atomic.h> #include <linux/kernel.h> struct llist_head { struct llist_node *first; }; struct llist_node { struct llist_node *next; }; #define LLIST_HEAD_INIT(name) { NULL } #define LLIST_HEAD(name) struct llist_head name = LLIST_HEAD_INIT(name) /** * init_llist_head - initialize lock-less list head * @head: the head for your lock-less list */ static inline void init_llist_head(struct llist_head *list) { list->first = NULL; } /** * llist_entry - get the struct of this entry * @ptr: the &struct llist_node pointer. * @type: the type of the struct this is embedded in. * @member: the name of the llist_node within the struct. */ #define llist_entry(ptr, type, member) \ container_of(ptr, type, member) /** * member_address_is_nonnull - check whether the member address is not NULL * @ptr: the object pointer (struct type * that contains the llist_node) * @member: the name of the llist_node within the struct. * * This macro is conceptually the same as * &ptr->member != NULL * but it works around the fact that compilers can decide that taking a member * address is never a NULL pointer. * * Real objects that start at a high address and have a member at NULL are * unlikely to exist, but such pointers may be returned e.g. by the * container_of() macro. */ #define member_address_is_nonnull(ptr, member) \ ((uintptr_t)(ptr) + offsetof(typeof(*(ptr)), member) != 0) /** * llist_for_each - iterate over some deleted entries of a lock-less list * @pos: the &struct llist_node to use as a loop cursor * @node: the first entry of deleted list entries * * In general, some entries of the lock-less list can be traversed * safely only after being deleted from list, so start with an entry * instead of list head. * * If being used on entries deleted from lock-less list directly, the * traverse order is from the newest to the oldest added entry. If * you want to traverse from the oldest to the newest, you must * reverse the order by yourself before traversing. */ #define llist_for_each(pos, node) \ for ((pos) = (node); pos; (pos) = (pos)->next) /** * llist_for_each_safe - iterate over some deleted entries of a lock-less list * safe against removal of list entry * @pos: the &struct llist_node to use as a loop cursor * @n: another &struct llist_node to use as temporary storage * @node: the first entry of deleted list entries * * In general, some entries of the lock-less list can be traversed * safely only after being deleted from list, so start with an entry * instead of list head. * * If being used on entries deleted from lock-less list directly, the * traverse order is from the newest to the oldest added entry. If * you want to traverse from the oldest to the newest, you must * reverse the order by yourself before traversing. */ #define llist_for_each_safe(pos, n, node) \ for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n)) /** * llist_for_each_entry - iterate over some deleted entries of lock-less list of given type * @pos: the type * to use as a loop cursor. * @node: the fist entry of deleted list entries. * @member: the name of the llist_node with the struct. * * In general, some entries of the lock-less list can be traversed * safely only after being removed from list, so start with an entry * instead of list head. * * If being used on entries deleted from lock-less list directly, the * traverse order is from the newest to the oldest added entry. If * you want to traverse from the oldest to the newest, you must * reverse the order by yourself before traversing. */ #define llist_for_each_entry(pos, node, member) \ for ((pos) = llist_entry((node), typeof(*(pos)), member); \ member_address_is_nonnull(pos, member); \ (pos) = llist_entry((pos)->member.next, typeof(*(pos)), member)) /** * llist_for_each_entry_safe - iterate over some deleted entries of lock-less list of given type * safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @node: the first entry of deleted list entries. * @member: the name of the llist_node with the struct. * * In general, some entries of the lock-less list can be traversed * safely only after being removed from list, so start with an entry * instead of list head. * * If being used on entries deleted from lock-less list directly, the * traverse order is from the newest to the oldest added entry. If * you want to traverse from the oldest to the newest, you must * reverse the order by yourself before traversing. */ #define llist_for_each_entry_safe(pos, n, node, member) \ for (pos = llist_entry((node), typeof(*pos), member); \ member_address_is_nonnull(pos, member) && \ (n = llist_entry(pos->member.next, typeof(*n), member), true); \ pos = n) /** * llist_empty - tests whether a lock-less list is empty * @head: the list to test * * Not guaranteed to be accurate or up to date. Just a quick way to * test whether the list is empty without deleting something from the * list. */ static inline bool llist_empty(const struct llist_head *head) { return READ_ONCE(head->first) == NULL; } static inline struct llist_node *llist_next(struct llist_node *node) { return node->next; } extern bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, struct llist_head *head); /** * llist_add - add a new entry * @new: new entry to be added * @head: the head for your lock-less list * * Returns true if the list was empty prior to adding this entry. */ static inline bool llist_add(struct llist_node *new, struct llist_head *head) { return llist_add_batch(new, new, head); } /** * llist_del_all - delete all entries from lock-less list * @head: the head of lock-less list to delete all entries * * If list is empty, return NULL, otherwise, delete all entries and * return the pointer to the first entry. The order of entries * deleted is from the newest to the oldest added one. */ static inline struct llist_node *llist_del_all(struct llist_head *head) { return xchg(&head->first, NULL); } extern struct llist_node *llist_del_first(struct llist_head *head); struct llist_node *llist_reverse_order(struct llist_node *head); #endif /* LLIST_H */
1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992 obz under the linux copyright * * Dynamic diacritical handling - aeb@cwi.nl - Dec 1993 * Dynamic keymap and string allocation - aeb@cwi.nl - May 1994 * Restrict VT switching via ioctl() - grif@cs.ucr.edu - Dec 1995 * Some code moved for less code duplication - Andi Kleen - Mar 1997 * Check put/get_user, cleanups - acme@conectiva.com.br - Jun 2001 */ #include <linux/types.h> #include <linux/errno.h> #include <linux/sched/signal.h> #include <linux/tty.h> #include <linux/timer.h> #include <linux/kernel.h> #include <linux/compat.h> #include <linux/module.h> #include <linux/kd.h> #include <linux/vt.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/major.h> #include <linux/fs.h> #include <linux/console.h> #include <linux/consolemap.h> #include <linux/signal.h> #include <linux/suspend.h> #include <linux/timex.h> #include <asm/io.h> #include <linux/uaccess.h> #include <linux/nospec.h> #include <linux/kbd_kern.h> #include <linux/vt_kern.h> #include <linux/kbd_diacr.h> #include <linux/selection.h> bool vt_dont_switch; static inline bool vt_in_use(unsigned int i) { const struct vc_data *vc = vc_cons[i].d; /* * console_lock must be held to prevent the vc from being deallocated * while we're checking whether it's in-use. */ WARN_CONSOLE_UNLOCKED(); return vc && kref_read(&vc->port.kref) > 1; } static inline bool vt_busy(int i) { if (vt_in_use(i)) return true; if (i == fg_console) return true; if (vc_is_sel(vc_cons[i].d)) return true; return false; } /* * Console (vt and kd) routines, as defined by USL SVR4 manual, and by * experimentation and study of X386 SYSV handling. * * One point of difference: SYSV vt's are /dev/vtX, which X >= 0, and * /dev/console is a separate ttyp. Under Linux, /dev/tty0 is /dev/console, * and the vc start at /dev/ttyX, X >= 1. We maintain that here, so we will * always treat our set of vt as numbered 1..MAX_NR_CONSOLES (corresponding to * ttys 0..MAX_NR_CONSOLES-1). Explicitly naming VT 0 is illegal, but using * /dev/tty0 (fg_console) as a target is legal, since an implicit aliasing * to the current console is done by the main ioctl code. */ #ifdef CONFIG_X86 #include <asm/syscalls.h> #endif static void complete_change_console(struct vc_data *vc); /* * User space VT_EVENT handlers */ struct vt_event_wait { struct list_head list; struct vt_event event; int done; }; static LIST_HEAD(vt_events); static DEFINE_SPINLOCK(vt_event_lock); static DECLARE_WAIT_QUEUE_HEAD(vt_event_waitqueue); /** * vt_event_post * @event: the event that occurred * @old: old console * @new: new console * * Post an VT event to interested VT handlers */ void vt_event_post(unsigned int event, unsigned int old, unsigned int new) { struct list_head *pos, *head; unsigned long flags; int wake = 0; spin_lock_irqsave(&vt_event_lock, flags); head = &vt_events; list_for_each(pos, head) { struct vt_event_wait *ve = list_entry(pos, struct vt_event_wait, list); if (!(ve->event.event & event)) continue; ve->event.event = event; /* kernel view is consoles 0..n-1, user space view is console 1..n with 0 meaning current, so we must bias */ ve->event.oldev = old + 1; ve->event.newev = new + 1; wake = 1; ve->done = 1; } spin_unlock_irqrestore(&vt_event_lock, flags); if (wake) wake_up_interruptible(&vt_event_waitqueue); } static void __vt_event_queue(struct vt_event_wait *vw) { unsigned long flags; /* Prepare the event */ INIT_LIST_HEAD(&vw->list); vw->done = 0; /* Queue our event */ spin_lock_irqsave(&vt_event_lock, flags); list_add(&vw->list, &vt_events); spin_unlock_irqrestore(&vt_event_lock, flags); } static void __vt_event_wait(struct vt_event_wait *vw) { /* Wait for it to pass */ wait_event_interruptible(vt_event_waitqueue, vw->done); } static void __vt_event_dequeue(struct vt_event_wait *vw) { unsigned long flags; /* Dequeue it */ spin_lock_irqsave(&vt_event_lock, flags); list_del(&vw->list); spin_unlock_irqrestore(&vt_event_lock, flags); } /** * vt_event_wait - wait for an event * @vw: our event * * Waits for an event to occur which completes our vt_event_wait * structure. On return the structure has wv->done set to 1 for success * or 0 if some event such as a signal ended the wait. */ static void vt_event_wait(struct vt_event_wait *vw) { __vt_event_queue(vw); __vt_event_wait(vw); __vt_event_dequeue(vw); } /** * vt_event_wait_ioctl - event ioctl handler * @event: argument to ioctl (the event) * * Implement the VT_WAITEVENT ioctl using the VT event interface */ static int vt_event_wait_ioctl(struct vt_event __user *event) { struct vt_event_wait vw; if (copy_from_user(&vw.event, event, sizeof(struct vt_event))) return -EFAULT; /* Highest supported event for now */ if (vw.event.event & ~VT_MAX_EVENT) return -EINVAL; vt_event_wait(&vw); /* If it occurred report it */ if (vw.done) { if (copy_to_user(event, &vw.event, sizeof(struct vt_event))) return -EFAULT; return 0; } return -EINTR; } /** * vt_waitactive - active console wait * @n: new console * * Helper for event waits. Used to implement the legacy * event waiting ioctls in terms of events */ int vt_waitactive(int n) { struct vt_event_wait vw; do { vw.event.event = VT_EVENT_SWITCH; __vt_event_queue(&vw); if (n == fg_console + 1) { __vt_event_dequeue(&vw); break; } __vt_event_wait(&vw); __vt_event_dequeue(&vw); if (vw.done == 0) return -EINTR; } while (vw.event.newev != n); return 0; } /* * these are the valid i/o ports we're allowed to change. they map all the * video ports */ #define GPFIRST 0x3b4 #define GPLAST 0x3df #define GPNUM (GPLAST - GPFIRST + 1) /* * currently, setting the mode from KD_TEXT to KD_GRAPHICS doesn't do a whole * lot. i'm not sure if it should do any restoration of modes or what... * * XXX It should at least call into the driver, fbdev's definitely need to * restore their engine state. --BenH * * Called with the console lock held. */ static int vt_kdsetmode(struct vc_data *vc, unsigned long mode) { switch (mode) { case KD_GRAPHICS: break; case KD_TEXT0: case KD_TEXT1: mode = KD_TEXT; fallthrough; case KD_TEXT: break; default: return -EINVAL; } if (vc->vc_mode == mode) return 0; vc->vc_mode = mode; if (vc->vc_num != fg_console) return 0; /* explicitly blank/unblank the screen if switching modes */ if (mode == KD_TEXT) do_unblank_screen(1); else do_blank_screen(1); return 0; } static int vt_k_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg, bool perm) { struct vc_data *vc = tty->driver_data; void __user *up = (void __user *)arg; unsigned int console = vc->vc_num; int ret; switch (cmd) { case KIOCSOUND: if (!perm) return -EPERM; /* * The use of PIT_TICK_RATE is historic, it used to be * the platform-dependent CLOCK_TICK_RATE between 2.6.12 * and 2.6.36, which was a minor but unfortunate ABI * change. kd_mksound is locked by the input layer. */ if (arg) arg = PIT_TICK_RATE / arg; kd_mksound(arg, 0); break; case KDMKTONE: if (!perm) return -EPERM; { unsigned int ticks, count; /* * Generate the tone for the appropriate number of ticks. * If the time is zero, turn off sound ourselves. */ ticks = msecs_to_jiffies((arg >> 16) & 0xffff); count = ticks ? (arg & 0xffff) : 0; if (count) count = PIT_TICK_RATE / count; kd_mksound(count, ticks); break; } case KDGKBTYPE: /* * this is naïve. */ return put_user(KB_101, (char __user *)arg); /* * These cannot be implemented on any machine that implements * ioperm() in user level (such as Alpha PCs) or not at all. * * XXX: you should never use these, just call ioperm directly.. */ #ifdef CONFIG_X86 case KDADDIO: case KDDELIO: /* * KDADDIO and KDDELIO may be able to add ports beyond what * we reject here, but to be safe... * * These are locked internally via sys_ioperm */ if (arg < GPFIRST || arg > GPLAST) return -EINVAL; return ksys_ioperm(arg, 1, (cmd == KDADDIO)) ? -ENXIO : 0; case KDENABIO: case KDDISABIO: return ksys_ioperm(GPFIRST, GPNUM, (cmd == KDENABIO)) ? -ENXIO : 0; #endif /* Linux m68k/i386 interface for setting the keyboard delay/repeat rate */ case KDKBDREP: { struct kbd_repeat kbrep; if (!capable(CAP_SYS_TTY_CONFIG)) return -EPERM; if (copy_from_user(&kbrep, up, sizeof(struct kbd_repeat))) return -EFAULT; ret = kbd_rate(&kbrep); if (ret) return ret; if (copy_to_user(up, &kbrep, sizeof(struct kbd_repeat))) return -EFAULT; break; } case KDSETMODE: if (!perm) return -EPERM; console_lock(); ret = vt_kdsetmode(vc, arg); console_unlock(); return ret; case KDGETMODE: return put_user(vc->vc_mode, (int __user *)arg); case KDMAPDISP: case KDUNMAPDISP: /* * these work like a combination of mmap and KDENABIO. * this could be easily finished. */ return -EINVAL; case KDSKBMODE: if (!perm) return -EPERM; ret = vt_do_kdskbmode(console, arg); if (ret) return ret; tty_ldisc_flush(tty); break; case KDGKBMODE: return put_user(vt_do_kdgkbmode(console), (int __user *)arg); /* this could be folded into KDSKBMODE, but for compatibility reasons it is not so easy to fold KDGKBMETA into KDGKBMODE */ case KDSKBMETA: return vt_do_kdskbmeta(console, arg); case KDGKBMETA: /* FIXME: should review whether this is worth locking */ return put_user(vt_do_kdgkbmeta(console), (int __user *)arg); case KDGETKEYCODE: case KDSETKEYCODE: if(!capable(CAP_SYS_TTY_CONFIG)) perm = 0; return vt_do_kbkeycode_ioctl(cmd, up, perm); case KDGKBENT: case KDSKBENT: return vt_do_kdsk_ioctl(cmd, up, perm, console); case KDGKBSENT: case KDSKBSENT: return vt_do_kdgkb_ioctl(cmd, up, perm); /* Diacritical processing. Handled in keyboard.c as it has to operate on the keyboard locks and structures */ case KDGKBDIACR: case KDGKBDIACRUC: case KDSKBDIACR: case KDSKBDIACRUC: return vt_do_diacrit(cmd, up, perm); /* the ioctls below read/set the flags usually shown in the leds */ /* don't use them - they will go away without warning */ case KDGKBLED: case KDSKBLED: case KDGETLED: case KDSETLED: return vt_do_kdskled(console, cmd, arg, perm); /* * A process can indicate its willingness to accept signals * generated by pressing an appropriate key combination. * Thus, one can have a daemon that e.g. spawns a new console * upon a keypress and then changes to it. * See also the kbrequest field of inittab(5). */ case KDSIGACCEPT: if (!perm || !capable(CAP_KILL)) return -EPERM; if (!valid_signal(arg) || arg < 1 || arg == SIGKILL) return -EINVAL; spin_lock_irq(&vt_spawn_con.lock); put_pid(vt_spawn_con.pid); vt_spawn_con.pid = get_pid(task_pid(current)); vt_spawn_con.sig = arg; spin_unlock_irq(&vt_spawn_con.lock); break; case KDFONTOP: { struct console_font_op op; if (copy_from_user(&op, up, sizeof(op))) return -EFAULT; if (!perm && op.op != KD_FONT_OP_GET) return -EPERM; ret = con_font_op(vc, &op); if (ret) return ret; if (copy_to_user(up, &op, sizeof(op))) return -EFAULT; break; } default: return -ENOIOCTLCMD; } return 0; } static inline int do_unimap_ioctl(int cmd, struct unimapdesc __user *user_ud, bool perm, struct vc_data *vc) { struct unimapdesc tmp; if (copy_from_user(&tmp, user_ud, sizeof tmp)) return -EFAULT; switch (cmd) { case PIO_UNIMAP: if (!perm) return -EPERM; return con_set_unimap(vc, tmp.entry_ct, tmp.entries); case GIO_UNIMAP: if (!perm && fg_console != vc->vc_num) return -EPERM; return con_get_unimap(vc, tmp.entry_ct, &(user_ud->entry_ct), tmp.entries); } return 0; } static int vt_io_ioctl(struct vc_data *vc, unsigned int cmd, void __user *up, bool perm) { switch (cmd) { case PIO_CMAP: if (!perm) return -EPERM; return con_set_cmap(up); case GIO_CMAP: return con_get_cmap(up); case PIO_SCRNMAP: if (!perm) return -EPERM; return con_set_trans_old(up); case GIO_SCRNMAP: return con_get_trans_old(up); case PIO_UNISCRNMAP: if (!perm) return -EPERM; return con_set_trans_new(up); case GIO_UNISCRNMAP: return con_get_trans_new(up); case PIO_UNIMAPCLR: if (!perm) return -EPERM; con_clear_unimap(vc); break; case PIO_UNIMAP: case GIO_UNIMAP: return do_unimap_ioctl(cmd, up, perm, vc); default: return -ENOIOCTLCMD; } return 0; } static int vt_reldisp(struct vc_data *vc, unsigned int swtch) { int newvt, ret; if (vc->vt_mode.mode != VT_PROCESS) return -EINVAL; /* Switched-to response */ if (vc->vt_newvt < 0) { /* If it's just an ACK, ignore it */ return swtch == VT_ACKACQ ? 0 : -EINVAL; } /* Switching-from response */ if (swtch == 0) { /* Switch disallowed, so forget we were trying to do it. */ vc->vt_newvt = -1; return 0; } /* The current vt has been released, so complete the switch. */ newvt = vc->vt_newvt; vc->vt_newvt = -1; ret = vc_allocate(newvt); if (ret) return ret; /* * When we actually do the console switch, make sure we are atomic with * respect to other console switches.. */ complete_change_console(vc_cons[newvt].d); return 0; } static int vt_setactivate(struct vt_setactivate __user *sa) { struct vt_setactivate vsa; struct vc_data *nvc; int ret; if (copy_from_user(&vsa, sa, sizeof(vsa))) return -EFAULT; if (vsa.console == 0 || vsa.console > MAX_NR_CONSOLES) return -ENXIO; vsa.console--; vsa.console = array_index_nospec(vsa.console, MAX_NR_CONSOLES); console_lock(); ret = vc_allocate(vsa.console); if (ret) { console_unlock(); return ret; } /* * This is safe providing we don't drop the console sem between * vc_allocate and finishing referencing nvc. */ nvc = vc_cons[vsa.console].d; nvc->vt_mode = vsa.mode; nvc->vt_mode.frsig = 0; put_pid(nvc->vt_pid); nvc->vt_pid = get_pid(task_pid(current)); console_unlock(); /* Commence switch and lock */ /* Review set_console locks */ set_console(vsa.console); return 0; } /* deallocate a single console, if possible (leave 0) */ static int vt_disallocate(unsigned int vc_num) { struct vc_data *vc = NULL; int ret = 0; console_lock(); if (vt_busy(vc_num)) ret = -EBUSY; else if (vc_num) vc = vc_deallocate(vc_num); console_unlock(); if (vc && vc_num >= MIN_NR_CONSOLES) tty_port_put(&vc->port); return ret; } /* deallocate all unused consoles, but leave 0 */ static void vt_disallocate_all(void) { struct vc_data *vc[MAX_NR_CONSOLES]; int i; console_lock(); for (i = 1; i < MAX_NR_CONSOLES; i++) if (!vt_busy(i)) vc[i] = vc_deallocate(i); else vc[i] = NULL; console_unlock(); for (i = 1; i < MAX_NR_CONSOLES; i++) { if (vc[i] && i >= MIN_NR_CONSOLES) tty_port_put(&vc[i]->port); } } static int vt_resizex(struct vc_data *vc, struct vt_consize __user *cs) { struct vt_consize v; int i; if (copy_from_user(&v, cs, sizeof(struct vt_consize))) return -EFAULT; /* FIXME: Should check the copies properly */ if (!v.v_vlin) v.v_vlin = vc->vc_scan_lines; if (v.v_clin) { int rows = v.v_vlin / v.v_clin; if (v.v_rows != rows) { if (v.v_rows) /* Parameters don't add up */ return -EINVAL; v.v_rows = rows; } } if (v.v_vcol && v.v_ccol) { int cols = v.v_vcol / v.v_ccol; if (v.v_cols != cols) { if (v.v_cols) return -EINVAL; v.v_cols = cols; } } if (v.v_clin > 32) return -EINVAL; for (i = 0; i < MAX_NR_CONSOLES; i++) { struct vc_data *vcp; if (!vc_cons[i].d) continue; console_lock(); vcp = vc_cons[i].d; if (vcp) { int ret; int save_scan_lines = vcp->vc_scan_lines; int save_cell_height = vcp->vc_cell_height; if (v.v_vlin) vcp->vc_scan_lines = v.v_vlin; if (v.v_clin) vcp->vc_cell_height = v.v_clin; vcp->vc_resize_user = 1; ret = vc_resize(vcp, v.v_cols, v.v_rows); if (ret) { vcp->vc_scan_lines = save_scan_lines; vcp->vc_cell_height = save_cell_height; console_unlock(); return ret; } } console_unlock(); } return 0; } /* * We handle the console-specific ioctl's here. We allow the * capability to modify any console, not just the fg_console. */ int vt_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct vc_data *vc = tty->driver_data; void __user *up = (void __user *)arg; int i, perm; int ret; /* * To have permissions to do most of the vt ioctls, we either have * to be the owner of the tty, or have CAP_SYS_TTY_CONFIG. */ perm = 0; if (current->signal->tty == tty || capable(CAP_SYS_TTY_CONFIG)) perm = 1; ret = vt_k_ioctl(tty, cmd, arg, perm); if (ret != -ENOIOCTLCMD) return ret; ret = vt_io_ioctl(vc, cmd, up, perm); if (ret != -ENOIOCTLCMD) return ret; switch (cmd) { case TIOCLINUX: return tioclinux(tty, arg); case VT_SETMODE: { struct vt_mode tmp; if (!perm) return -EPERM; if (copy_from_user(&tmp, up, sizeof(struct vt_mode))) return -EFAULT; if (tmp.mode != VT_AUTO && tmp.mode != VT_PROCESS) return -EINVAL; console_lock(); vc->vt_mode = tmp; /* the frsig is ignored, so we set it to 0 */ vc->vt_mode.frsig = 0; put_pid(vc->vt_pid); vc->vt_pid = get_pid(task_pid(current)); /* no switch is required -- saw@shade.msu.ru */ vc->vt_newvt = -1; console_unlock(); break; } case VT_GETMODE: { struct vt_mode tmp; int rc; console_lock(); memcpy(&tmp, &vc->vt_mode, sizeof(struct vt_mode)); console_unlock(); rc = copy_to_user(up, &tmp, sizeof(struct vt_mode)); if (rc) return -EFAULT; break; } /* * Returns global vt state. Note that VT 0 is always open, since * it's an alias for the current VT, and people can't use it here. * We cannot return state for more than 16 VTs, since v_state is short. */ case VT_GETSTATE: { struct vt_stat __user *vtstat = up; unsigned short state, mask; if (put_user(fg_console + 1, &vtstat->v_active)) return -EFAULT; state = 1; /* /dev/tty0 is always open */ console_lock(); /* required by vt_in_use() */ for (i = 0, mask = 2; i < MAX_NR_CONSOLES && mask; ++i, mask <<= 1) if (vt_in_use(i)) state |= mask; console_unlock(); return put_user(state, &vtstat->v_state); } /* * Returns the first available (non-opened) console. */ case VT_OPENQRY: console_lock(); /* required by vt_in_use() */ for (i = 0; i < MAX_NR_CONSOLES; ++i) if (!vt_in_use(i)) break; console_unlock(); i = i < MAX_NR_CONSOLES ? (i+1) : -1; return put_user(i, (int __user *)arg); /* * ioctl(fd, VT_ACTIVATE, num) will cause us to switch to vt # num, * with num >= 1 (switches to vt 0, our console, are not allowed, just * to preserve sanity). */ case VT_ACTIVATE: if (!perm) return -EPERM; if (arg == 0 || arg > MAX_NR_CONSOLES) return -ENXIO; arg--; arg = array_index_nospec(arg, MAX_NR_CONSOLES); console_lock(); ret = vc_allocate(arg); console_unlock(); if (ret) return ret; set_console(arg); break; case VT_SETACTIVATE: if (!perm) return -EPERM; return vt_setactivate(up); /* * wait until the specified VT has been activated */ case VT_WAITACTIVE: if (!perm) return -EPERM; if (arg == 0 || arg > MAX_NR_CONSOLES) return -ENXIO; return vt_waitactive(arg); /* * If a vt is under process control, the kernel will not switch to it * immediately, but postpone the operation until the process calls this * ioctl, allowing the switch to complete. * * According to the X sources this is the behavior: * 0: pending switch-from not OK * 1: pending switch-from OK * 2: completed switch-to OK */ case VT_RELDISP: if (!perm) return -EPERM; console_lock(); ret = vt_reldisp(vc, arg); console_unlock(); return ret; /* * Disallocate memory associated to VT (but leave VT1) */ case VT_DISALLOCATE: if (arg > MAX_NR_CONSOLES) return -ENXIO; if (arg == 0) vt_disallocate_all(); else return vt_disallocate(--arg); break; case VT_RESIZE: { struct vt_sizes __user *vtsizes = up; struct vc_data *vc; ushort ll,cc; if (!perm) return -EPERM; if (get_user(ll, &vtsizes->v_rows) || get_user(cc, &vtsizes->v_cols)) return -EFAULT; console_lock(); for (i = 0; i < MAX_NR_CONSOLES; i++) { vc = vc_cons[i].d; if (vc) { vc->vc_resize_user = 1; /* FIXME: review v tty lock */ vc_resize(vc_cons[i].d, cc, ll); } } console_unlock(); break; } case VT_RESIZEX: if (!perm) return -EPERM; return vt_resizex(vc, up); case VT_LOCKSWITCH: if (!capable(CAP_SYS_TTY_CONFIG)) return -EPERM; vt_dont_switch = true; break; case VT_UNLOCKSWITCH: if (!capable(CAP_SYS_TTY_CONFIG)) return -EPERM; vt_dont_switch = false; break; case VT_GETHIFONTMASK: return put_user(vc->vc_hi_font_mask, (unsigned short __user *)arg); case VT_WAITEVENT: return vt_event_wait_ioctl((struct vt_event __user *)arg); default: return -ENOIOCTLCMD; } return 0; } void reset_vc(struct vc_data *vc) { vc->vc_mode = KD_TEXT; vt_reset_unicode(vc->vc_num); vc->vt_mode.mode = VT_AUTO; vc->vt_mode.waitv = 0; vc->vt_mode.relsig = 0; vc->vt_mode.acqsig = 0; vc->vt_mode.frsig = 0; put_pid(vc->vt_pid); vc->vt_pid = NULL; vc->vt_newvt = -1; if (!in_interrupt()) /* Via keyboard.c:SAK() - akpm */ reset_palette(vc); } void vc_SAK(struct work_struct *work) { struct vc *vc_con = container_of(work, struct vc, SAK_work); struct vc_data *vc; struct tty_struct *tty; console_lock(); vc = vc_con->d; if (vc) { /* FIXME: review tty ref counting */ tty = vc->port.tty; /* * SAK should also work in all raw modes and reset * them properly. */ if (tty) __do_SAK(tty); reset_vc(vc); } console_unlock(); } #ifdef CONFIG_COMPAT struct compat_console_font_op { compat_uint_t op; /* operation code KD_FONT_OP_* */ compat_uint_t flags; /* KD_FONT_FLAG_* */ compat_uint_t width, height; /* font size */ compat_uint_t charcount; compat_caddr_t data; /* font data with height fixed to 32 */ }; static inline int compat_kdfontop_ioctl(struct compat_console_font_op __user *fontop, int perm, struct console_font_op *op, struct vc_data *vc) { int i; if (copy_from_user(op, fontop, sizeof(struct compat_console_font_op))) return -EFAULT; if (!perm && op->op != KD_FONT_OP_GET) return -EPERM; op->data = compat_ptr(((struct compat_console_font_op *)op)->data); i = con_font_op(vc, op); if (i) return i; ((struct compat_console_font_op *)op)->data = (unsigned long)op->data; if (copy_to_user(fontop, op, sizeof(struct compat_console_font_op))) return -EFAULT; return 0; } struct compat_unimapdesc { unsigned short entry_ct; compat_caddr_t entries; }; static inline int compat_unimap_ioctl(unsigned int cmd, struct compat_unimapdesc __user *user_ud, int perm, struct vc_data *vc) { struct compat_unimapdesc tmp; struct unipair __user *tmp_entries; if (copy_from_user(&tmp, user_ud, sizeof tmp)) return -EFAULT; tmp_entries = compat_ptr(tmp.entries); switch (cmd) { case PIO_UNIMAP: if (!perm) return -EPERM; return con_set_unimap(vc, tmp.entry_ct, tmp_entries); case GIO_UNIMAP: if (!perm && fg_console != vc->vc_num) return -EPERM; return con_get_unimap(vc, tmp.entry_ct, &(user_ud->entry_ct), tmp_entries); } return 0; } long vt_compat_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct vc_data *vc = tty->driver_data; struct console_font_op op; /* used in multiple places here */ void __user *up = compat_ptr(arg); int perm; /* * To have permissions to do most of the vt ioctls, we either have * to be the owner of the tty, or have CAP_SYS_TTY_CONFIG. */ perm = 0; if (current->signal->tty == tty || capable(CAP_SYS_TTY_CONFIG)) perm = 1; switch (cmd) { /* * these need special handlers for incompatible data structures */ case KDFONTOP: return compat_kdfontop_ioctl(up, perm, &op, vc); case PIO_UNIMAP: case GIO_UNIMAP: return compat_unimap_ioctl(cmd, up, perm, vc); /* * all these treat 'arg' as an integer */ case KIOCSOUND: case KDMKTONE: #ifdef CONFIG_X86 case KDADDIO: case KDDELIO: #endif case KDSETMODE: case KDMAPDISP: case KDUNMAPDISP: case KDSKBMODE: case KDSKBMETA: case KDSKBLED: case KDSETLED: case KDSIGACCEPT: case VT_ACTIVATE: case VT_WAITACTIVE: case VT_RELDISP: case VT_DISALLOCATE: case VT_RESIZE: case VT_RESIZEX: return vt_ioctl(tty, cmd, arg); /* * the rest has a compatible data structure behind arg, * but we have to convert it to a proper 64 bit pointer. */ default: return vt_ioctl(tty, cmd, (unsigned long)up); } } #endif /* CONFIG_COMPAT */ /* * Performs the back end of a vt switch. Called under the console * semaphore. */ static void complete_change_console(struct vc_data *vc) { unsigned char old_vc_mode; int old = fg_console; last_console = fg_console; /* * If we're switching, we could be going from KD_GRAPHICS to * KD_TEXT mode or vice versa, which means we need to blank or * unblank the screen later. */ old_vc_mode = vc_cons[fg_console].d->vc_mode; switch_screen(vc); /* * This can't appear below a successful kill_pid(). If it did, * then the *blank_screen operation could occur while X, having * received acqsig, is waking up on another processor. This * condition can lead to overlapping accesses to the VGA range * and the framebuffer (causing system lockups). * * To account for this we duplicate this code below only if the * controlling process is gone and we've called reset_vc. */ if (old_vc_mode != vc->vc_mode) { if (vc->vc_mode == KD_TEXT) do_unblank_screen(1); else do_blank_screen(1); } /* * If this new console is under process control, send it a signal * telling it that it has acquired. Also check if it has died and * clean up (similar to logic employed in change_console()) */ if (vc->vt_mode.mode == VT_PROCESS) { /* * Send the signal as privileged - kill_pid() will * tell us if the process has gone or something else * is awry */ if (kill_pid(vc->vt_pid, vc->vt_mode.acqsig, 1) != 0) { /* * The controlling process has died, so we revert back to * normal operation. In this case, we'll also change back * to KD_TEXT mode. I'm not sure if this is strictly correct * but it saves the agony when the X server dies and the screen * remains blanked due to KD_GRAPHICS! It would be nice to do * this outside of VT_PROCESS but there is no single process * to account for and tracking tty count may be undesirable. */ reset_vc(vc); if (old_vc_mode != vc->vc_mode) { if (vc->vc_mode == KD_TEXT) do_unblank_screen(1); else do_blank_screen(1); } } } /* * Wake anyone waiting for their VT to activate */ vt_event_post(VT_EVENT_SWITCH, old, vc->vc_num); return; } /* * Performs the front-end of a vt switch */ void change_console(struct vc_data *new_vc) { struct vc_data *vc; if (!new_vc || new_vc->vc_num == fg_console || vt_dont_switch) return; /* * If this vt is in process mode, then we need to handshake with * that process before switching. Essentially, we store where that * vt wants to switch to and wait for it to tell us when it's done * (via VT_RELDISP ioctl). * * We also check to see if the controlling process still exists. * If it doesn't, we reset this vt to auto mode and continue. * This is a cheap way to track process control. The worst thing * that can happen is: we send a signal to a process, it dies, and * the switch gets "lost" waiting for a response; hopefully, the * user will try again, we'll detect the process is gone (unless * the user waits just the right amount of time :-) and revert the * vt to auto control. */ vc = vc_cons[fg_console].d; if (vc->vt_mode.mode == VT_PROCESS) { /* * Send the signal as privileged - kill_pid() will * tell us if the process has gone or something else * is awry. * * We need to set vt_newvt *before* sending the signal or we * have a race. */ vc->vt_newvt = new_vc->vc_num; if (kill_pid(vc->vt_pid, vc->vt_mode.relsig, 1) == 0) { /* * It worked. Mark the vt to switch to and * return. The process needs to send us a * VT_RELDISP ioctl to complete the switch. */ return; } /* * The controlling process has died, so we revert back to * normal operation. In this case, we'll also change back * to KD_TEXT mode. I'm not sure if this is strictly correct * but it saves the agony when the X server dies and the screen * remains blanked due to KD_GRAPHICS! It would be nice to do * this outside of VT_PROCESS but there is no single process * to account for and tracking tty count may be undesirable. */ reset_vc(vc); /* * Fall through to normal (VT_AUTO) handling of the switch... */ } /* * Ignore all switches in KD_GRAPHICS+VT_AUTO mode */ if (vc->vc_mode == KD_GRAPHICS) return; complete_change_console(new_vc); } /* Perform a kernel triggered VT switch for suspend/resume */ static int disable_vt_switch; int vt_move_to_console(unsigned int vt, int alloc) { int prev; console_lock(); /* Graphics mode - up to X */ if (disable_vt_switch) { console_unlock(); return 0; } prev = fg_console; if (alloc && vc_allocate(vt)) { /* we can't have a free VC for now. Too bad, * we don't want to mess the screen for now. */ console_unlock(); return -ENOSPC; } if (set_console(vt)) { /* * We're unable to switch to the SUSPEND_CONSOLE. * Let the calling function know so it can decide * what to do. */ console_unlock(); return -EIO; } console_unlock(); if (vt_waitactive(vt + 1)) { pr_debug("Suspend: Can't switch VCs."); return -EINTR; } return prev; } /* * Normally during a suspend, we allocate a new console and switch to it. * When we resume, we switch back to the original console. This switch * can be slow, so on systems where the framebuffer can handle restoration * of video registers anyways, there's little point in doing the console * switch. This function allows you to disable it by passing it '0'. */ void pm_set_vt_switch(int do_switch) { console_lock(); disable_vt_switch = !do_switch; console_unlock(); } EXPORT_SYMBOL(pm_set_vt_switch);
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LIST_H #define _LINUX_LIST_H #include <linux/types.h> #include <linux/stddef.h> #include <linux/poison.h> #include <linux/const.h> #include <linux/kernel.h> /* * Simple doubly linked list implementation. * * Some of the internal functions ("__xxx") are useful when * manipulating whole lists rather than single entries, as * sometimes we already know the next/prev entries and we can * generate better code by using them directly rather than * using the generic single-entry routines. */ #define LIST_HEAD_INIT(name) { &(name), &(name) } #define LIST_HEAD(name) \ struct list_head name = LIST_HEAD_INIT(name) /** * INIT_LIST_HEAD - Initialize a list_head structure * @list: list_head structure to be initialized. * * Initializes the list_head to point to itself. If it is a list header, * the result is an empty list. */ static inline void INIT_LIST_HEAD(struct list_head *list) { WRITE_ONCE(list->next, list); list->prev = list; } #ifdef CONFIG_DEBUG_LIST extern bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next); extern bool __list_del_entry_valid(struct list_head *entry); #else static inline bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { return true; } static inline bool __list_del_entry_valid(struct list_head *entry) { return true; } #endif /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { if (!__list_add_valid(new, prev, next)) return; next->prev = new; new->next = next; new->prev = prev; WRITE_ONCE(prev->next, new); } /** * list_add - add a new entry * @new: new entry to be added * @head: list head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void list_add(struct list_head *new, struct list_head *head) { __list_add(new, head, head->next); } /** * list_add_tail - add a new entry * @new: new entry to be added * @head: list head to add it before * * Insert a new entry before the specified head. * This is useful for implementing queues. */ static inline void list_add_tail(struct list_head *new, struct list_head *head) { __list_add(new, head->prev, head); } /* * Delete a list entry by making the prev/next entries * point to each other. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_del(struct list_head * prev, struct list_head * next) { next->prev = prev; WRITE_ONCE(prev->next, next); } /* * Delete a list entry and clear the 'prev' pointer. * * This is a special-purpose list clearing method used in the networking code * for lists allocated as per-cpu, where we don't want to incur the extra * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this * needs to check the node 'prev' pointer instead of calling list_empty(). */ static inline void __list_del_clearprev(struct list_head *entry) { __list_del(entry->prev, entry->next); entry->prev = NULL; } static inline void __list_del_entry(struct list_head *entry) { if (!__list_del_entry_valid(entry)) return; __list_del(entry->prev, entry->next); } /** * list_del - deletes entry from list. * @entry: the element to delete from the list. * Note: list_empty() on entry does not return true after this, the entry is * in an undefined state. */ static inline void list_del(struct list_head *entry) { __list_del_entry(entry); entry->next = LIST_POISON1; entry->prev = LIST_POISON2; } /** * list_replace - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace(struct list_head *old, struct list_head *new) { new->next = old->next; new->next->prev = new; new->prev = old->prev; new->prev->next = new; } /** * list_replace_init - replace old entry by new one and initialize the old one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace_init(struct list_head *old, struct list_head *new) { list_replace(old, new); INIT_LIST_HEAD(old); } /** * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position * @entry1: the location to place entry2 * @entry2: the location to place entry1 */ static inline void list_swap(struct list_head *entry1, struct list_head *entry2) { struct list_head *pos = entry2->prev; list_del(entry2); list_replace(entry1, entry2); if (pos == entry1) pos = entry2; list_add(entry1, pos); } /** * list_del_init - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. */ static inline void list_del_init(struct list_head *entry) { __list_del_entry(entry); INIT_LIST_HEAD(entry); } /** * list_move - delete from one list and add as another's head * @list: the entry to move * @head: the head that will precede our entry */ static inline void list_move(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add(list, head); } /** * list_move_tail - delete from one list and add as another's tail * @list: the entry to move * @head: the head that will follow our entry */ static inline void list_move_tail(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add_tail(list, head); } /** * list_bulk_move_tail - move a subsection of a list to its tail * @head: the head that will follow our entry * @first: first entry to move * @last: last entry to move, can be the same as first * * Move all entries between @first and including @last before @head. * All three entries must belong to the same linked list. */ static inline void list_bulk_move_tail(struct list_head *head, struct list_head *first, struct list_head *last) { first->prev->next = last->next; last->next->prev = first->prev; head->prev->next = first; first->prev = head->prev; last->next = head; head->prev = last; } /** * list_is_first -- tests whether @list is the first entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_first(const struct list_head *list, const struct list_head *head) { return list->prev == head; } /** * list_is_last - tests whether @list is the last entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_last(const struct list_head *list, const struct list_head *head) { return list->next == head; } /** * list_empty - tests whether a list is empty * @head: the list to test. */ static inline int list_empty(const struct list_head *head) { return READ_ONCE(head->next) == head; } /** * list_del_init_careful - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. * * This is the same as list_del_init(), except designed to be used * together with list_empty_careful() in a way to guarantee ordering * of other memory operations. * * Any memory operations done before a list_del_init_careful() are * guaranteed to be visible after a list_empty_careful() test. */ static inline void list_del_init_careful(struct list_head *entry) { __list_del_entry(entry); entry->prev = entry; smp_store_release(&entry->next, entry); } /** * list_empty_careful - tests whether a list is empty and not being modified * @head: the list to test * * Description: * tests whether a list is empty _and_ checks that no other CPU might be * in the process of modifying either member (next or prev) * * NOTE: using list_empty_careful() without synchronization * can only be safe if the only activity that can happen * to the list entry is list_del_init(). Eg. it cannot be used * if another CPU could re-list_add() it. */ static inline int list_empty_careful(const struct list_head *head) { struct list_head *next = smp_load_acquire(&head->next); return (next == head) && (next == head->prev); } /** * list_rotate_left - rotate the list to the left * @head: the head of the list */ static inline void list_rotate_left(struct list_head *head) { struct list_head *first; if (!list_empty(head)) { first = head->next; list_move_tail(first, head); } } /** * list_rotate_to_front() - Rotate list to specific item. * @list: The desired new front of the list. * @head: The head of the list. * * Rotates list so that @list becomes the new front of the list. */ static inline void list_rotate_to_front(struct list_head *list, struct list_head *head) { /* * Deletes the list head from the list denoted by @head and * places it as the tail of @list, this effectively rotates the * list so that @list is at the front. */ list_move_tail(head, list); } /** * list_is_singular - tests whether a list has just one entry. * @head: the list to test. */ static inline int list_is_singular(const struct list_head *head) { return !list_empty(head) && (head->next == head->prev); } static inline void __list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { struct list_head *new_first = entry->next; list->next = head->next; list->next->prev = list; list->prev = entry; entry->next = list; head->next = new_first; new_first->prev = head; } /** * list_cut_position - cut a list into two * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * and if so we won't cut the list * * This helper moves the initial part of @head, up to and * including @entry, from @head to @list. You should * pass on @entry an element you know is on @head. @list * should be an empty list or a list you do not care about * losing its data. * */ static inline void list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { if (list_empty(head)) return; if (list_is_singular(head) && (head->next != entry && head != entry)) return; if (entry == head) INIT_LIST_HEAD(list); else __list_cut_position(list, head, entry); } /** * list_cut_before - cut a list into two, before given entry * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * * This helper moves the initial part of @head, up to but * excluding @entry, from @head to @list. You should pass * in @entry an element you know is on @head. @list should * be an empty list or a list you do not care about losing * its data. * If @entry == @head, all entries on @head are moved to * @list. */ static inline void list_cut_before(struct list_head *list, struct list_head *head, struct list_head *entry) { if (head->next == entry) { INIT_LIST_HEAD(list); return; } list->next = head->next; list->next->prev = list; list->prev = entry->prev; list->prev->next = list; head->next = entry; entry->prev = head; } static inline void __list_splice(const struct list_head *list, struct list_head *prev, struct list_head *next) { struct list_head *first = list->next; struct list_head *last = list->prev; first->prev = prev; prev->next = first; last->next = next; next->prev = last; } /** * list_splice - join two lists, this is designed for stacks * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice(const struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head, head->next); } /** * list_splice_tail - join two lists, each list being a queue * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice_tail(struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head->prev, head); } /** * list_splice_init - join two lists and reinitialise the emptied list. * @list: the new list to add. * @head: the place to add it in the first list. * * The list at @list is reinitialised */ static inline void list_splice_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head, head->next); INIT_LIST_HEAD(list); } } /** * list_splice_tail_init - join two lists and reinitialise the emptied list * @list: the new list to add. * @head: the place to add it in the first list. * * Each of the lists is a queue. * The list at @list is reinitialised */ static inline void list_splice_tail_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head->prev, head); INIT_LIST_HEAD(list); } } /** * list_entry - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. */ #define list_entry(ptr, type, member) \ container_of(ptr, type, member) /** * list_first_entry - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) /** * list_last_entry - get the last element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_last_entry(ptr, type, member) \ list_entry((ptr)->prev, type, member) /** * list_first_entry_or_null - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the list is empty, it returns NULL. */ #define list_first_entry_or_null(ptr, type, member) ({ \ struct list_head *head__ = (ptr); \ struct list_head *pos__ = READ_ONCE(head__->next); \ pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ }) /** * list_next_entry - get the next element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_next_entry(pos, member) \ list_entry((pos)->member.next, typeof(*(pos)), member) /** * list_prev_entry - get the prev element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_prev_entry(pos, member) \ list_entry((pos)->member.prev, typeof(*(pos)), member) /** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each(pos, head) \ for (pos = (head)->next; pos != (head); pos = pos->next) /** * list_for_each_continue - continue iteration over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. * * Continue to iterate over a list, continuing after the current position. */ #define list_for_each_continue(pos, head) \ for (pos = pos->next; pos != (head); pos = pos->next) /** * list_for_each_prev - iterate over a list backwards * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each_prev(pos, head) \ for (pos = (head)->prev; pos != (head); pos = pos->prev) /** * list_for_each_safe - iterate over a list safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_safe(pos, n, head) \ for (pos = (head)->next, n = pos->next; pos != (head); \ pos = n, n = pos->next) /** * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_prev_safe(pos, n, head) \ for (pos = (head)->prev, n = pos->prev; \ pos != (head); \ pos = n, n = pos->prev) /** * list_entry_is_head - test if the entry points to the head of the list * @pos: the type * to cursor * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_entry_is_head(pos, head, member) \ (&pos->member == (head)) /** * list_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry(pos, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_reverse - iterate backwards over list of given type. * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_reverse(pos, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue() * @pos: the type * to use as a start point * @head: the head of the list * @member: the name of the list_head within the struct. * * Prepares a pos entry for use as a start point in list_for_each_entry_continue(). */ #define list_prepare_entry(pos, head, member) \ ((pos) ? : list_entry(head, typeof(*pos), member)) /** * list_for_each_entry_continue - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Continue to iterate over list of given type, continuing after * the current position. */ #define list_for_each_entry_continue(pos, head, member) \ for (pos = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_continue_reverse - iterate backwards from the given point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Start to iterate over list of given type backwards, continuing after * the current position. */ #define list_for_each_entry_continue_reverse(pos, head, member) \ for (pos = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_from - iterate over list of given type from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_from_reverse - iterate backwards over list of given type * from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, continuing from current position. */ #define list_for_each_entry_from_reverse(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_safe(pos, n, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_continue - continue list iteration safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing after current point, * safe against removal of list entry. */ #define list_for_each_entry_safe_continue(pos, n, head, member) \ for (pos = list_next_entry(pos, member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_from - iterate over list from current point safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type from current point, safe against * removal of list entry. */ #define list_for_each_entry_safe_from(pos, n, head, member) \ for (n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, safe against removal * of list entry. */ #define list_for_each_entry_safe_reverse(pos, n, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member), \ n = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_prev_entry(n, member)) /** * list_safe_reset_next - reset a stale list_for_each_entry_safe loop * @pos: the loop cursor used in the list_for_each_entry_safe loop * @n: temporary storage used in list_for_each_entry_safe * @member: the name of the list_head within the struct. * * list_safe_reset_next is not safe to use in general if the list may be * modified concurrently (eg. the lock is dropped in the loop body). An * exception to this is if the cursor element (pos) is pinned in the list, * and list_safe_reset_next is called after re-taking the lock and before * completing the current iteration of the loop body. */ #define list_safe_reset_next(pos, n, member) \ n = list_next_entry(pos, member) /* * Double linked lists with a single pointer list head. * Mostly useful for hash tables where the two pointer list head is * too wasteful. * You lose the ability to access the tail in O(1). */ #define HLIST_HEAD_INIT { .first = NULL } #define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } #define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) static inline void INIT_HLIST_NODE(struct hlist_node *h) { h->next = NULL; h->pprev = NULL; } /** * hlist_unhashed - Has node been removed from list and reinitialized? * @h: Node to be checked * * Not that not all removal functions will leave a node in unhashed * state. For example, hlist_nulls_del_init_rcu() does leave the * node in unhashed state, but hlist_nulls_del() does not. */ static inline int hlist_unhashed(const struct hlist_node *h) { return !h->pprev; } /** * hlist_unhashed_lockless - Version of hlist_unhashed for lockless use * @h: Node to be checked * * This variant of hlist_unhashed() must be used in lockless contexts * to avoid potential load-tearing. The READ_ONCE() is paired with the * various WRITE_ONCE() in hlist helpers that are defined below. */ static inline int hlist_unhashed_lockless(const struct hlist_node *h) { return !READ_ONCE(h->pprev); } /** * hlist_empty - Is the specified hlist_head structure an empty hlist? * @h: Structure to check. */ static inline int hlist_empty(const struct hlist_head *h) { return !READ_ONCE(h->first); } static inline void __hlist_del(struct hlist_node *n) { struct hlist_node *next = n->next; struct hlist_node **pprev = n->pprev; WRITE_ONCE(*pprev, next); if (next) WRITE_ONCE(next->pprev, pprev); } /** * hlist_del - Delete the specified hlist_node from its list * @n: Node to delete. * * Note that this function leaves the node in hashed state. Use * hlist_del_init() or similar instead to unhash @n. */ static inline void hlist_del(struct hlist_node *n) { __hlist_del(n); n->next = LIST_POISON1; n->pprev = LIST_POISON2; } /** * hlist_del_init - Delete the specified hlist_node from its list and initialize * @n: Node to delete. * * Note that this function leaves the node in unhashed state. */ static inline void hlist_del_init(struct hlist_node *n) { if (!hlist_unhashed(n)) { __hlist_del(n); INIT_HLIST_NODE(n); } } /** * hlist_add_head - add a new entry at the beginning of the hlist * @n: new entry to be added * @h: hlist head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; WRITE_ONCE(n->next, first); if (first) WRITE_ONCE(first->pprev, &n->next); WRITE_ONCE(h->first, n); WRITE_ONCE(n->pprev, &h->first); } /** * hlist_add_before - add a new entry before the one specified * @n: new entry to be added * @next: hlist node to add it before, which must be non-NULL */ static inline void hlist_add_before(struct hlist_node *n, struct hlist_node *next) { WRITE_ONCE(n->pprev, next->pprev); WRITE_ONCE(n->next, next); WRITE_ONCE(next->pprev, &n->next); WRITE_ONCE(*(n->pprev), n); } /** * hlist_add_behing - add a new entry after the one specified * @n: new entry to be added * @prev: hlist node to add it after, which must be non-NULL */ static inline void hlist_add_behind(struct hlist_node *n, struct hlist_node *prev) { WRITE_ONCE(n->next, prev->next); WRITE_ONCE(prev->next, n); WRITE_ONCE(n->pprev, &prev->next); if (n->next) WRITE_ONCE(n->next->pprev, &n->next); } /** * hlist_add_fake - create a fake hlist consisting of a single headless node * @n: Node to make a fake list out of * * This makes @n appear to be its own predecessor on a headless hlist. * The point of this is to allow things like hlist_del() to work correctly * in cases where there is no list. */ static inline void hlist_add_fake(struct hlist_node *n) { n->pprev = &n->next; } /** * hlist_fake: Is this node a fake hlist? * @h: Node to check for being a self-referential fake hlist. */ static inline bool hlist_fake(struct hlist_node *h) { return h->pprev == &h->next; } /** * hlist_is_singular_node - is node the only element of the specified hlist? * @n: Node to check for singularity. * @h: Header for potentially singular list. * * Check whether the node is the only node of the head without * accessing head, thus avoiding unnecessary cache misses. */ static inline bool hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h) { return !n->next && n->pprev == &h->first; } /** * hlist_move_list - Move an hlist * @old: hlist_head for old list. * @new: hlist_head for new list. * * Move a list from one list head to another. Fixup the pprev * reference of the first entry if it exists. */ static inline void hlist_move_list(struct hlist_head *old, struct hlist_head *new) { new->first = old->first; if (new->first) new->first->pprev = &new->first; old->first = NULL; } #define hlist_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_for_each(pos, head) \ for (pos = (head)->first; pos ; pos = pos->next) #define hlist_for_each_safe(pos, n, head) \ for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ pos = n) #define hlist_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? hlist_entry(____ptr, type, member) : NULL; \ }) /** * hlist_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry(pos, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_continue - iterate over a hlist continuing after current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue(pos, member) \ for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_from - iterate over a hlist continuing from current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_from(pos, member) \ for (; pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: a &struct hlist_node to use as temporary storage * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_safe(pos, n, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\ pos && ({ n = pos->member.next; 1; }); \ pos = hlist_entry_safe(n, typeof(*pos), member)) #endif
8 8 4 1 8 8 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 // SPDX-License-Identifier: GPL-2.0-or-later /* * Security plug functions * * Copyright (C) 2001 WireX Communications, Inc <chris@wirex.com> * Copyright (C) 2001-2002 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2001 Networks Associates Technology, Inc <ssmalley@nai.com> * Copyright (C) 2016 Mellanox Technologies */ #define pr_fmt(fmt) "LSM: " fmt #include <linux/bpf.h> #include <linux/capability.h> #include <linux/dcache.h> #include <linux/export.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/kernel_read_file.h> #include <linux/lsm_hooks.h> #include <linux/integrity.h> #include <linux/ima.h> #include <linux/evm.h> #include <linux/fsnotify.h> #include <linux/mman.h> #include <linux/mount.h> #include <linux/personality.h> #include <linux/backing-dev.h> #include <linux/string.h> #include <linux/msg.h> #include <net/flow.h> #define MAX_LSM_EVM_XATTR 2 /* How many LSMs were built into the kernel? */ #define LSM_COUNT (__end_lsm_info - __start_lsm_info) /* * These are descriptions of the reasons that can be passed to the * security_locked_down() LSM hook. Placing this array here allows * all security modules to use the same descriptions for auditing * purposes. */ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_NONE] = "none", [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading", [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port", [LOCKDOWN_EFI_TEST] = "/dev/efi_test access", [LOCKDOWN_KEXEC] = "kexec of unsigned images", [LOCKDOWN_HIBERNATION] = "hibernation", [LOCKDOWN_PCI_ACCESS] = "direct PCI access", [LOCKDOWN_IOPORT] = "raw io port access", [LOCKDOWN_MSR] = "raw MSR access", [LOCKDOWN_ACPI_TABLES] = "modifying ACPI tables", [LOCKDOWN_PCMCIA_CIS] = "direct PCMCIA CIS storage", [LOCKDOWN_TIOCSSERIAL] = "reconfiguration of serial port IO", [LOCKDOWN_MODULE_PARAMETERS] = "unsafe module parameters", [LOCKDOWN_MMIOTRACE] = "unsafe mmio", [LOCKDOWN_DEBUGFS] = "debugfs access", [LOCKDOWN_XMON_WR] = "xmon write access", [LOCKDOWN_BPF_WRITE_USER] = "use of bpf to write user RAM", [LOCKDOWN_DBG_WRITE_KERNEL] = "use of kgdb/kdb to write kernel RAM", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_KCORE] = "/proc/kcore access", [LOCKDOWN_KPROBES] = "use of kprobes", [LOCKDOWN_BPF_READ] = "use of bpf to read kernel RAM", [LOCKDOWN_DBG_READ_KERNEL] = "use of kgdb/kdb to read kernel RAM", [LOCKDOWN_PERF] = "unsafe use of perf", [LOCKDOWN_TRACEFS] = "use of tracefs", [LOCKDOWN_XMON_RW] = "xmon read and write access", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; struct security_hook_heads security_hook_heads __lsm_ro_after_init; static BLOCKING_NOTIFIER_HEAD(blocking_lsm_notifier_chain); static struct kmem_cache *lsm_file_cache; static struct kmem_cache *lsm_inode_cache; char *lsm_names; static struct lsm_blob_sizes blob_sizes __lsm_ro_after_init; /* Boot-time LSM user choice */ static __initdata const char *chosen_lsm_order; static __initdata const char *chosen_major_lsm; static __initconst const char * const builtin_lsm_order = CONFIG_LSM; /* Ordered list of LSMs to initialize. */ static __initdata struct lsm_info **ordered_lsms; static __initdata struct lsm_info *exclusive; static __initdata bool debug; #define init_debug(...) \ do { \ if (debug) \ pr_info(__VA_ARGS__); \ } while (0) static bool __init is_enabled(struct lsm_info *lsm) { if (!lsm->enabled) return false; return *lsm->enabled; } /* Mark an LSM's enabled flag. */ static int lsm_enabled_true __initdata = 1; static int lsm_enabled_false __initdata = 0; static void __init set_enabled(struct lsm_info *lsm, bool enabled) { /* * When an LSM hasn't configured an enable variable, we can use * a hard-coded location for storing the default enabled state. */ if (!lsm->enabled) { if (enabled) lsm->enabled = &lsm_enabled_true; else lsm->enabled = &lsm_enabled_false; } else if (lsm->enabled == &lsm_enabled_true) { if (!enabled) lsm->enabled = &lsm_enabled_false; } else if (lsm->enabled == &lsm_enabled_false) { if (enabled) lsm->enabled = &lsm_enabled_true; } else { *lsm->enabled = enabled; } } /* Is an LSM already listed in the ordered LSMs list? */ static bool __init exists_ordered_lsm(struct lsm_info *lsm) { struct lsm_info **check; for (check = ordered_lsms; *check; check++) if (*check == lsm) return true; return false; } /* Append an LSM to the list of ordered LSMs to initialize. */ static int last_lsm __initdata; static void __init append_ordered_lsm(struct lsm_info *lsm, const char *from) { /* Ignore duplicate selections. */ if (exists_ordered_lsm(lsm)) return; if (WARN(last_lsm == LSM_COUNT, "%s: out of LSM slots!?\n", from)) return; /* Enable this LSM, if it is not already set. */ if (!lsm->enabled) lsm->enabled = &lsm_enabled_true; ordered_lsms[last_lsm++] = lsm; init_debug("%s ordering: %s (%sabled)\n", from, lsm->name, is_enabled(lsm) ? "en" : "dis"); } /* Is an LSM allowed to be initialized? */ static bool __init lsm_allowed(struct lsm_info *lsm) { /* Skip if the LSM is disabled. */ if (!is_enabled(lsm)) return false; /* Not allowed if another exclusive LSM already initialized. */ if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && exclusive) { init_debug("exclusive disabled: %s\n", lsm->name); return false; } return true; } static void __init lsm_set_blob_size(int *need, int *lbs) { int offset; if (*need > 0) { offset = *lbs; *lbs += *need; *need = offset; } } static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed) { if (!needed) return; lsm_set_blob_size(&needed->lbs_cred, &blob_sizes.lbs_cred); lsm_set_blob_size(&needed->lbs_file, &blob_sizes.lbs_file); /* * The inode blob gets an rcu_head in addition to * what the modules might need. */ if (needed->lbs_inode && blob_sizes.lbs_inode == 0) blob_sizes.lbs_inode = sizeof(struct rcu_head); lsm_set_blob_size(&needed->lbs_inode, &blob_sizes.lbs_inode); lsm_set_blob_size(&needed->lbs_ipc, &blob_sizes.lbs_ipc); lsm_set_blob_size(&needed->lbs_msg_msg, &blob_sizes.lbs_msg_msg); lsm_set_blob_size(&needed->lbs_task, &blob_sizes.lbs_task); } /* Prepare LSM for initialization. */ static void __init prepare_lsm(struct lsm_info *lsm) { int enabled = lsm_allowed(lsm); /* Record enablement (to handle any following exclusive LSMs). */ set_enabled(lsm, enabled); /* If enabled, do pre-initialization work. */ if (enabled) { if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && !exclusive) { exclusive = lsm; init_debug("exclusive chosen: %s\n", lsm->name); } lsm_set_blob_sizes(lsm->blobs); } } /* Initialize a given LSM, if it is enabled. */ static void __init initialize_lsm(struct lsm_info *lsm) { if (is_enabled(lsm)) { int ret; init_debug("initializing %s\n", lsm->name); ret = lsm->init(); WARN(ret, "%s failed to initialize: %d\n", lsm->name, ret); } } /* Populate ordered LSMs list from comma-separated LSM name list. */ static void __init ordered_lsm_parse(const char *order, const char *origin) { struct lsm_info *lsm; char *sep, *name, *next; /* LSM_ORDER_FIRST is always first. */ for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (lsm->order == LSM_ORDER_FIRST) append_ordered_lsm(lsm, "first"); } /* Process "security=", if given. */ if (chosen_major_lsm) { struct lsm_info *major; /* * To match the original "security=" behavior, this * explicitly does NOT fallback to another Legacy Major * if the selected one was separately disabled: disable * all non-matching Legacy Major LSMs. */ for (major = __start_lsm_info; major < __end_lsm_info; major++) { if ((major->flags & LSM_FLAG_LEGACY_MAJOR) && strcmp(major->name, chosen_major_lsm) != 0) { set_enabled(major, false); init_debug("security=%s disabled: %s\n", chosen_major_lsm, major->name); } } } sep = kstrdup(order, GFP_KERNEL); next = sep; /* Walk the list, looking for matching LSMs. */ while ((name = strsep(&next, ",")) != NULL) { bool found = false; for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (lsm->order == LSM_ORDER_MUTABLE && strcmp(lsm->name, name) == 0) { append_ordered_lsm(lsm, origin); found = true; } } if (!found) init_debug("%s ignored: %s\n", origin, name); } /* Process "security=", if given. */ if (chosen_major_lsm) { for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (exists_ordered_lsm(lsm)) continue; if (strcmp(lsm->name, chosen_major_lsm) == 0) append_ordered_lsm(lsm, "security="); } } /* Disable all LSMs not in the ordered list. */ for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (exists_ordered_lsm(lsm)) continue; set_enabled(lsm, false); init_debug("%s disabled: %s\n", origin, lsm->name); } kfree(sep); } static void __init lsm_early_cred(struct cred *cred); static void __init lsm_early_task(struct task_struct *task); static int lsm_append(const char *new, char **result); static void __init ordered_lsm_init(void) { struct lsm_info **lsm; ordered_lsms = kcalloc(LSM_COUNT + 1, sizeof(*ordered_lsms), GFP_KERNEL); if (chosen_lsm_order) { if (chosen_major_lsm) { pr_info("security= is ignored because it is superseded by lsm=\n"); chosen_major_lsm = NULL; } ordered_lsm_parse(chosen_lsm_order, "cmdline"); } else ordered_lsm_parse(builtin_lsm_order, "builtin"); for (lsm = ordered_lsms; *lsm; lsm++) prepare_lsm(*lsm); init_debug("cred blob size = %d\n", blob_sizes.lbs_cred); init_debug("file blob size = %d\n", blob_sizes.lbs_file); init_debug("inode blob size = %d\n", blob_sizes.lbs_inode); init_debug("ipc blob size = %d\n", blob_sizes.lbs_ipc); init_debug("msg_msg blob size = %d\n", blob_sizes.lbs_msg_msg); init_debug("task blob size = %d\n", blob_sizes.lbs_task); /* * Create any kmem_caches needed for blobs */ if (blob_sizes.lbs_file) lsm_file_cache = kmem_cache_create("lsm_file_cache", blob_sizes.lbs_file, 0, SLAB_PANIC, NULL); if (blob_sizes.lbs_inode) lsm_inode_cache = kmem_cache_create("lsm_inode_cache", blob_sizes.lbs_inode, 0, SLAB_PANIC, NULL); lsm_early_cred((struct cred *) current->cred); lsm_early_task(current); for (lsm = ordered_lsms; *lsm; lsm++) initialize_lsm(*lsm); kfree(ordered_lsms); } int __init early_security_init(void) { int i; struct hlist_head *list = (struct hlist_head *) &security_hook_heads; struct lsm_info *lsm; for (i = 0; i < sizeof(security_hook_heads) / sizeof(struct hlist_head); i++) INIT_HLIST_HEAD(&list[i]); for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) { if (!lsm->enabled) lsm->enabled = &lsm_enabled_true; prepare_lsm(lsm); initialize_lsm(lsm); } return 0; } /** * security_init - initializes the security framework * * This should be called early in the kernel initialization sequence. */ int __init security_init(void) { struct lsm_info *lsm; pr_info("Security Framework initializing\n"); /* * Append the names of the early LSM modules now that kmalloc() is * available */ for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) { if (lsm->enabled) lsm_append(lsm->name, &lsm_names); } /* Load LSMs in specified order. */ ordered_lsm_init(); return 0; } /* Save user chosen LSM */ static int __init choose_major_lsm(char *str) { chosen_major_lsm = str; return 1; } __setup("security=", choose_major_lsm); /* Explicitly choose LSM initialization order. */ static int __init choose_lsm_order(char *str) { chosen_lsm_order = str; return 1; } __setup("lsm=", choose_lsm_order); /* Enable LSM order debugging. */ static int __init enable_debug(char *str) { debug = true; return 1; } __setup("lsm.debug", enable_debug); static bool match_last_lsm(const char *list, const char *lsm) { const char *last; if (WARN_ON(!list || !lsm)) return false; last = strrchr(list, ','); if (last) /* Pass the comma, strcmp() will check for '\0' */ last++; else last = list; return !strcmp(last, lsm); } static int lsm_append(const char *new, char **result) { char *cp; if (*result == NULL) { *result = kstrdup(new, GFP_KERNEL); if (*result == NULL) return -ENOMEM; } else { /* Check if it is the last registered name */ if (match_last_lsm(*result, new)) return 0; cp = kasprintf(GFP_KERNEL, "%s,%s", *result, new); if (cp == NULL) return -ENOMEM; kfree(*result); *result = cp; } return 0; } /** * security_add_hooks - Add a modules hooks to the hook lists. * @hooks: the hooks to add * @count: the number of hooks to add * @lsm: the name of the security module * * Each LSM has to register its hooks with the infrastructure. */ void __init security_add_hooks(struct security_hook_list *hooks, int count, char *lsm) { int i; for (i = 0; i < count; i++) { hooks[i].lsm = lsm; hlist_add_tail_rcu(&hooks[i].list, hooks[i].head); } /* * Don't try to append during early_security_init(), we'll come back * and fix this up afterwards. */ if (slab_is_available()) { if (lsm_append(lsm, &lsm_names) < 0) panic("%s - Cannot get early memory.\n", __func__); } } int call_blocking_lsm_notifier(enum lsm_event event, void *data) { return blocking_notifier_call_chain(&blocking_lsm_notifier_chain, event, data); } EXPORT_SYMBOL(call_blocking_lsm_notifier); int register_blocking_lsm_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&blocking_lsm_notifier_chain, nb); } EXPORT_SYMBOL(register_blocking_lsm_notifier); int unregister_blocking_lsm_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&blocking_lsm_notifier_chain, nb); } EXPORT_SYMBOL(unregister_blocking_lsm_notifier); /** * lsm_cred_alloc - allocate a composite cred blob * @cred: the cred that needs a blob * @gfp: allocation type * * Allocate the cred blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_cred_alloc(struct cred *cred, gfp_t gfp) { if (blob_sizes.lbs_cred == 0) { cred->security = NULL; return 0; } cred->security = kzalloc(blob_sizes.lbs_cred, gfp); if (cred->security == NULL) return -ENOMEM; return 0; } /** * lsm_early_cred - during initialization allocate a composite cred blob * @cred: the cred that needs a blob * * Allocate the cred blob for all the modules */ static void __init lsm_early_cred(struct cred *cred) { int rc = lsm_cred_alloc(cred, GFP_KERNEL); if (rc) panic("%s: Early cred alloc failed.\n", __func__); } /** * lsm_file_alloc - allocate a composite file blob * @file: the file that needs a blob * * Allocate the file blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_file_alloc(struct file *file) { if (!lsm_file_cache) { file->f_security = NULL; return 0; } file->f_security = kmem_cache_zalloc(lsm_file_cache, GFP_KERNEL); if (file->f_security == NULL) return -ENOMEM; return 0; } /** * lsm_inode_alloc - allocate a composite inode blob * @inode: the inode that needs a blob * * Allocate the inode blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ int lsm_inode_alloc(struct inode *inode) { if (!lsm_inode_cache) { inode->i_security = NULL; return 0; } inode->i_security = kmem_cache_zalloc(lsm_inode_cache, GFP_NOFS); if (inode->i_security == NULL) return -ENOMEM; return 0; } /** * lsm_task_alloc - allocate a composite task blob * @task: the task that needs a blob * * Allocate the task blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_task_alloc(struct task_struct *task) { if (blob_sizes.lbs_task == 0) { task->security = NULL; return 0; } task->security = kzalloc(blob_sizes.lbs_task, GFP_KERNEL); if (task->security == NULL) return -ENOMEM; return 0; } /** * lsm_ipc_alloc - allocate a composite ipc blob * @kip: the ipc that needs a blob * * Allocate the ipc blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_ipc_alloc(struct kern_ipc_perm *kip) { if (blob_sizes.lbs_ipc == 0) { kip->security = NULL; return 0; } kip->security = kzalloc(blob_sizes.lbs_ipc, GFP_KERNEL); if (kip->security == NULL) return -ENOMEM; return 0; } /** * lsm_msg_msg_alloc - allocate a composite msg_msg blob * @mp: the msg_msg that needs a blob * * Allocate the ipc blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_msg_msg_alloc(struct msg_msg *mp) { if (blob_sizes.lbs_msg_msg == 0) { mp->security = NULL; return 0; } mp->security = kzalloc(blob_sizes.lbs_msg_msg, GFP_KERNEL); if (mp->security == NULL) return -ENOMEM; return 0; } /** * lsm_early_task - during initialization allocate a composite task blob * @task: the task that needs a blob * * Allocate the task blob for all the modules */ static void __init lsm_early_task(struct task_struct *task) { int rc = lsm_task_alloc(task); if (rc) panic("%s: Early task alloc failed.\n", __func__); } /* * The default value of the LSM hook is defined in linux/lsm_hook_defs.h and * can be accessed with: * * LSM_RET_DEFAULT(<hook_name>) * * The macros below define static constants for the default value of each * LSM hook. */ #define LSM_RET_DEFAULT(NAME) (NAME##_default) #define DECLARE_LSM_RET_DEFAULT_void(DEFAULT, NAME) #define DECLARE_LSM_RET_DEFAULT_int(DEFAULT, NAME) \ static const int LSM_RET_DEFAULT(NAME) = (DEFAULT); #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ DECLARE_LSM_RET_DEFAULT_##RET(DEFAULT, NAME) #include <linux/lsm_hook_defs.h> #undef LSM_HOOK /* * Hook list operation macros. * * call_void_hook: * This is a hook that does not return a value. * * call_int_hook: * This is a hook that returns a value. */ #define call_void_hook(FUNC, ...) \ do { \ struct security_hook_list *P; \ \ hlist_for_each_entry(P, &security_hook_heads.FUNC, list) \ P->hook.FUNC(__VA_ARGS__); \ } while (0) #define call_int_hook(FUNC, IRC, ...) ({ \ int RC = IRC; \ do { \ struct security_hook_list *P; \ \ hlist_for_each_entry(P, &security_hook_heads.FUNC, list) { \ RC = P->hook.FUNC(__VA_ARGS__); \ if (RC != 0) \ break; \ } \ } while (0); \ RC; \ }) /* Security operations */ int security_binder_set_context_mgr(const struct cred *mgr) { return call_int_hook(binder_set_context_mgr, 0, mgr); } int security_binder_transaction(const struct cred *from, const struct cred *to) { return call_int_hook(binder_transaction, 0, from, to); } int security_binder_transfer_binder(const struct cred *from, const struct cred *to) { return call_int_hook(binder_transfer_binder, 0, from, to); } int security_binder_transfer_file(const struct cred *from, const struct cred *to, struct file *file) { return call_int_hook(binder_transfer_file, 0, from, to, file); } int security_ptrace_access_check(struct task_struct *child, unsigned int mode) { return call_int_hook(ptrace_access_check, 0, child, mode); } int security_ptrace_traceme(struct task_struct *parent) { return call_int_hook(ptrace_traceme, 0, parent); } int security_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { return call_int_hook(capget, 0, target, effective, inheritable, permitted); } int security_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) { return call_int_hook(capset, 0, new, old, effective, inheritable, permitted); } int security_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { return call_int_hook(capable, 0, cred, ns, cap, opts); } int security_quotactl(int cmds, int type, int id, struct super_block *sb) { return call_int_hook(quotactl, 0, cmds, type, id, sb); } int security_quota_on(struct dentry *dentry) { return call_int_hook(quota_on, 0, dentry); } int security_syslog(int type) { return call_int_hook(syslog, 0, type); } int security_settime64(const struct timespec64 *ts, const struct timezone *tz) { return call_int_hook(settime, 0, ts, tz); } int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) { struct security_hook_list *hp; int cap_sys_admin = 1; int rc; /* * The module will respond with a positive value if * it thinks the __vm_enough_memory() call should be * made with the cap_sys_admin set. If all of the modules * agree that it should be set it will. If any module * thinks it should not be set it won't. */ hlist_for_each_entry(hp, &security_hook_heads.vm_enough_memory, list) { rc = hp->hook.vm_enough_memory(mm, pages); if (rc <= 0) { cap_sys_admin = 0; break; } } return __vm_enough_memory(mm, pages, cap_sys_admin); } int security_bprm_creds_for_exec(struct linux_binprm *bprm) { return call_int_hook(bprm_creds_for_exec, 0, bprm); } int security_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file) { return call_int_hook(bprm_creds_from_file, 0, bprm, file); } int security_bprm_check(struct linux_binprm *bprm) { int ret; ret = call_int_hook(bprm_check_security, 0, bprm); if (ret) return ret; return ima_bprm_check(bprm); } void security_bprm_committing_creds(struct linux_binprm *bprm) { call_void_hook(bprm_committing_creds, bprm); } void security_bprm_committed_creds(struct linux_binprm *bprm) { call_void_hook(bprm_committed_creds, bprm); } int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) { return call_int_hook(fs_context_dup, 0, fc, src_fc); } int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct security_hook_list *hp; int trc; int rc = -ENOPARAM; hlist_for_each_entry(hp, &security_hook_heads.fs_context_parse_param, list) { trc = hp->hook.fs_context_parse_param(fc, param); if (trc == 0) rc = 0; else if (trc != -ENOPARAM) return trc; } return rc; } int security_sb_alloc(struct super_block *sb) { return call_int_hook(sb_alloc_security, 0, sb); } void security_sb_free(struct super_block *sb) { call_void_hook(sb_free_security, sb); } void security_free_mnt_opts(void **mnt_opts) { if (!*mnt_opts) return; call_void_hook(sb_free_mnt_opts, *mnt_opts); *mnt_opts = NULL; } EXPORT_SYMBOL(security_free_mnt_opts); int security_sb_eat_lsm_opts(char *options, void **mnt_opts) { return call_int_hook(sb_eat_lsm_opts, 0, options, mnt_opts); } EXPORT_SYMBOL(security_sb_eat_lsm_opts); int security_sb_remount(struct super_block *sb, void *mnt_opts) { return call_int_hook(sb_remount, 0, sb, mnt_opts); } EXPORT_SYMBOL(security_sb_remount); int security_sb_kern_mount(struct super_block *sb) { return call_int_hook(sb_kern_mount, 0, sb); } int security_sb_show_options(struct seq_file *m, struct super_block *sb) { return call_int_hook(sb_show_options, 0, m, sb); } int security_sb_statfs(struct dentry *dentry) { return call_int_hook(sb_statfs, 0, dentry); } int security_sb_mount(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data) { return call_int_hook(sb_mount, 0, dev_name, path, type, flags, data); } int security_sb_umount(struct vfsmount *mnt, int flags) { return call_int_hook(sb_umount, 0, mnt, flags); } int security_sb_pivotroot(const struct path *old_path, const struct path *new_path) { return call_int_hook(sb_pivotroot, 0, old_path, new_path); } int security_sb_set_mnt_opts(struct super_block *sb, void *mnt_opts, unsigned long kern_flags, unsigned long *set_kern_flags) { return call_int_hook(sb_set_mnt_opts, mnt_opts ? -EOPNOTSUPP : 0, sb, mnt_opts, kern_flags, set_kern_flags); } EXPORT_SYMBOL(security_sb_set_mnt_opts); int security_sb_clone_mnt_opts(const struct super_block *oldsb, struct super_block *newsb, unsigned long kern_flags, unsigned long *set_kern_flags) { return call_int_hook(sb_clone_mnt_opts, 0, oldsb, newsb, kern_flags, set_kern_flags); } EXPORT_SYMBOL(security_sb_clone_mnt_opts); int security_add_mnt_opt(const char *option, const char *val, int len, void **mnt_opts) { return call_int_hook(sb_add_mnt_opt, -EINVAL, option, val, len, mnt_opts); } EXPORT_SYMBOL(security_add_mnt_opt); int security_move_mount(const struct path *from_path, const struct path *to_path) { return call_int_hook(move_mount, 0, from_path, to_path); } int security_path_notify(const struct path *path, u64 mask, unsigned int obj_type) { return call_int_hook(path_notify, 0, path, mask, obj_type); } int security_inode_alloc(struct inode *inode) { int rc = lsm_inode_alloc(inode); if (unlikely(rc)) return rc; rc = call_int_hook(inode_alloc_security, 0, inode); if (unlikely(rc)) security_inode_free(inode); return rc; } static void inode_free_by_rcu(struct rcu_head *head) { /* * The rcu head is at the start of the inode blob */ kmem_cache_free(lsm_inode_cache, head); } void security_inode_free(struct inode *inode) { integrity_inode_free(inode); call_void_hook(inode_free_security, inode); /* * The inode may still be referenced in a path walk and * a call to security_inode_permission() can be made * after inode_free_security() is called. Ideally, the VFS * wouldn't do this, but fixing that is a much harder * job. For now, simply free the i_security via RCU, and * leave the current inode->i_security pointer intact. * The inode will be freed after the RCU grace period too. */ if (inode->i_security) call_rcu((struct rcu_head *)inode->i_security, inode_free_by_rcu); } int security_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, void **ctx, u32 *ctxlen) { return call_int_hook(dentry_init_security, -EOPNOTSUPP, dentry, mode, name, ctx, ctxlen); } EXPORT_SYMBOL(security_dentry_init_security); int security_dentry_create_files_as(struct dentry *dentry, int mode, struct qstr *name, const struct cred *old, struct cred *new) { return call_int_hook(dentry_create_files_as, 0, dentry, mode, name, old, new); } EXPORT_SYMBOL(security_dentry_create_files_as); int security_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, const initxattrs initxattrs, void *fs_data) { struct xattr new_xattrs[MAX_LSM_EVM_XATTR + 1]; struct xattr *lsm_xattr, *evm_xattr, *xattr; int ret; if (unlikely(IS_PRIVATE(inode))) return 0; if (!initxattrs) return call_int_hook(inode_init_security, -EOPNOTSUPP, inode, dir, qstr, NULL, NULL, NULL); memset(new_xattrs, 0, sizeof(new_xattrs)); lsm_xattr = new_xattrs; ret = call_int_hook(inode_init_security, -EOPNOTSUPP, inode, dir, qstr, &lsm_xattr->name, &lsm_xattr->value, &lsm_xattr->value_len); if (ret) goto out; evm_xattr = lsm_xattr + 1; ret = evm_inode_init_security(inode, lsm_xattr, evm_xattr); if (ret) goto out; ret = initxattrs(inode, new_xattrs, fs_data); out: for (xattr = new_xattrs; xattr->value != NULL; xattr++) kfree(xattr->value); return (ret == -EOPNOTSUPP) ? 0 : ret; } EXPORT_SYMBOL(security_inode_init_security); int security_old_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, const char **name, void **value, size_t *len) { if (unlikely(IS_PRIVATE(inode))) return -EOPNOTSUPP; return call_int_hook(inode_init_security, -EOPNOTSUPP, inode, dir, qstr, name, value, len); } EXPORT_SYMBOL(security_old_inode_init_security); #ifdef CONFIG_SECURITY_PATH int security_path_mknod(const struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_mknod, 0, dir, dentry, mode, dev); } EXPORT_SYMBOL(security_path_mknod); int security_path_mkdir(const struct path *dir, struct dentry *dentry, umode_t mode) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_mkdir, 0, dir, dentry, mode); } EXPORT_SYMBOL(security_path_mkdir); int security_path_rmdir(const struct path *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_rmdir, 0, dir, dentry); } int security_path_unlink(const struct path *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_unlink, 0, dir, dentry); } EXPORT_SYMBOL(security_path_unlink); int security_path_symlink(const struct path *dir, struct dentry *dentry, const char *old_name) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_symlink, 0, dir, dentry, old_name); } int security_path_link(struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)))) return 0; return call_int_hook(path_link, 0, old_dentry, new_dir, new_dentry); } int security_path_rename(const struct path *old_dir, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, unsigned int flags) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)) || (d_is_positive(new_dentry) && IS_PRIVATE(d_backing_inode(new_dentry))))) return 0; if (flags & RENAME_EXCHANGE) { int err = call_int_hook(path_rename, 0, new_dir, new_dentry, old_dir, old_dentry); if (err) return err; } return call_int_hook(path_rename, 0, old_dir, old_dentry, new_dir, new_dentry); } EXPORT_SYMBOL(security_path_rename); int security_path_truncate(const struct path *path) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return call_int_hook(path_truncate, 0, path); } int security_path_chmod(const struct path *path, umode_t mode) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return call_int_hook(path_chmod, 0, path, mode); } int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return call_int_hook(path_chown, 0, path, uid, gid); } int security_path_chroot(const struct path *path) { return call_int_hook(path_chroot, 0, path); } #endif int security_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_create, 0, dir, dentry, mode); } EXPORT_SYMBOL_GPL(security_inode_create); int security_inode_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)))) return 0; return call_int_hook(inode_link, 0, old_dentry, dir, new_dentry); } int security_inode_unlink(struct inode *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_unlink, 0, dir, dentry); } int security_inode_symlink(struct inode *dir, struct dentry *dentry, const char *old_name) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_symlink, 0, dir, dentry, old_name); } int security_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_mkdir, 0, dir, dentry, mode); } EXPORT_SYMBOL_GPL(security_inode_mkdir); int security_inode_rmdir(struct inode *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_rmdir, 0, dir, dentry); } int security_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_mknod, 0, dir, dentry, mode, dev); } int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)) || (d_is_positive(new_dentry) && IS_PRIVATE(d_backing_inode(new_dentry))))) return 0; if (flags & RENAME_EXCHANGE) { int err = call_int_hook(inode_rename, 0, new_dir, new_dentry, old_dir, old_dentry); if (err) return err; } return call_int_hook(inode_rename, 0, old_dir, old_dentry, new_dir, new_dentry); } int security_inode_readlink(struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_readlink, 0, dentry); } int security_inode_follow_link(struct dentry *dentry, struct inode *inode, bool rcu) { if (unlikely(IS_PRIVATE(inode))) return 0; return call_int_hook(inode_follow_link, 0, dentry, inode, rcu); } int security_inode_permission(struct inode *inode, int mask) { if (unlikely(IS_PRIVATE(inode))) return 0; return call_int_hook(inode_permission, 0, inode, mask); } int security_inode_setattr(struct dentry *dentry, struct iattr *attr) { int ret; if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; ret = call_int_hook(inode_setattr, 0, dentry, attr); if (ret) return ret; return evm_inode_setattr(dentry, attr); } EXPORT_SYMBOL_GPL(security_inode_setattr); int security_inode_getattr(const struct path *path) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return call_int_hook(inode_getattr, 0, path); } int security_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { int ret; if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; /* * SELinux and Smack integrate the cap call, * so assume that all LSMs supplying this call do so. */ ret = call_int_hook(inode_setxattr, 1, dentry, name, value, size, flags); if (ret == 1) ret = cap_inode_setxattr(dentry, name, value, size, flags); if (ret) return ret; ret = ima_inode_setxattr(dentry, name, value, size); if (ret) return ret; return evm_inode_setxattr(dentry, name, value, size); } void security_inode_post_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return; call_void_hook(inode_post_setxattr, dentry, name, value, size, flags); evm_inode_post_setxattr(dentry, name, value, size); } int security_inode_getxattr(struct dentry *dentry, const char *name) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_getxattr, 0, dentry, name); } int security_inode_listxattr(struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_listxattr, 0, dentry); } int security_inode_removexattr(struct dentry *dentry, const char *name) { int ret; if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; /* * SELinux and Smack integrate the cap call, * so assume that all LSMs supplying this call do so. */ ret = call_int_hook(inode_removexattr, 1, dentry, name); if (ret == 1) ret = cap_inode_removexattr(dentry, name); if (ret) return ret; ret = ima_inode_removexattr(dentry, name); if (ret) return ret; return evm_inode_removexattr(dentry, name); } int security_inode_need_killpriv(struct dentry *dentry) { return call_int_hook(inode_need_killpriv, 0, dentry); } int security_inode_killpriv(struct dentry *dentry) { return call_int_hook(inode_killpriv, 0, dentry); } int security_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc) { struct security_hook_list *hp; int rc; if (unlikely(IS_PRIVATE(inode))) return LSM_RET_DEFAULT(inode_getsecurity); /* * Only one module will provide an attribute with a given name. */ hlist_for_each_entry(hp, &security_hook_heads.inode_getsecurity, list) { rc = hp->hook.inode_getsecurity(inode, name, buffer, alloc); if (rc != LSM_RET_DEFAULT(inode_getsecurity)) return rc; } return LSM_RET_DEFAULT(inode_getsecurity); } int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags) { struct security_hook_list *hp; int rc; if (unlikely(IS_PRIVATE(inode))) return LSM_RET_DEFAULT(inode_setsecurity); /* * Only one module will provide an attribute with a given name. */ hlist_for_each_entry(hp, &security_hook_heads.inode_setsecurity, list) { rc = hp->hook.inode_setsecurity(inode, name, value, size, flags); if (rc != LSM_RET_DEFAULT(inode_setsecurity)) return rc; } return LSM_RET_DEFAULT(inode_setsecurity); } int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size) { if (unlikely(IS_PRIVATE(inode))) return 0; return call_int_hook(inode_listsecurity, 0, inode, buffer, buffer_size); } EXPORT_SYMBOL(security_inode_listsecurity); void security_inode_getsecid(struct inode *inode, u32 *secid) { call_void_hook(inode_getsecid, inode, secid); } int security_inode_copy_up(struct dentry *src, struct cred **new) { return call_int_hook(inode_copy_up, 0, src, new); } EXPORT_SYMBOL(security_inode_copy_up); int security_inode_copy_up_xattr(const char *name) { struct security_hook_list *hp; int rc; /* * The implementation can return 0 (accept the xattr), 1 (discard the * xattr), -EOPNOTSUPP if it does not know anything about the xattr or * any other error code incase of an error. */ hlist_for_each_entry(hp, &security_hook_heads.inode_copy_up_xattr, list) { rc = hp->hook.inode_copy_up_xattr(name); if (rc != LSM_RET_DEFAULT(inode_copy_up_xattr)) return rc; } return LSM_RET_DEFAULT(inode_copy_up_xattr); } EXPORT_SYMBOL(security_inode_copy_up_xattr); int security_kernfs_init_security(struct kernfs_node *kn_dir, struct kernfs_node *kn) { return call_int_hook(kernfs_init_security, 0, kn_dir, kn); } int security_file_permission(struct file *file, int mask) { int ret; ret = call_int_hook(file_permission, 0, file, mask); if (ret) return ret; return fsnotify_perm(file, mask); } int security_file_alloc(struct file *file) { int rc = lsm_file_alloc(file); if (rc) return rc; rc = call_int_hook(file_alloc_security, 0, file); if (unlikely(rc)) security_file_free(file); return rc; } void security_file_free(struct file *file) { void *blob; call_void_hook(file_free_security, file); blob = file->f_security; if (blob) { file->f_security = NULL; kmem_cache_free(lsm_file_cache, blob); } } int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return call_int_hook(file_ioctl, 0, file, cmd, arg); } EXPORT_SYMBOL_GPL(security_file_ioctl); /** * security_file_ioctl_compat() - Check if an ioctl is allowed in compat mode * @file: associated file * @cmd: ioctl cmd * @arg: ioctl arguments * * Compat version of security_file_ioctl() that correctly handles 32-bit * processes running on 64-bit kernels. * * Return: Returns 0 if permission is granted. */ int security_file_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) { return call_int_hook(file_ioctl_compat, 0, file, cmd, arg); } EXPORT_SYMBOL_GPL(security_file_ioctl_compat); static inline unsigned long mmap_prot(struct file *file, unsigned long prot) { /* * Does we have PROT_READ and does the application expect * it to imply PROT_EXEC? If not, nothing to talk about... */ if ((prot & (PROT_READ | PROT_EXEC)) != PROT_READ) return prot; if (!(current->personality & READ_IMPLIES_EXEC)) return prot; /* * if that's an anonymous mapping, let it. */ if (!file) return prot | PROT_EXEC; /* * ditto if it's not on noexec mount, except that on !MMU we need * NOMMU_MAP_EXEC (== VM_MAYEXEC) in this case */ if (!path_noexec(&file->f_path)) { #ifndef CONFIG_MMU if (file->f_op->mmap_capabilities) { unsigned caps = file->f_op->mmap_capabilities(file); if (!(caps & NOMMU_MAP_EXEC)) return prot; } #endif return prot | PROT_EXEC; } /* anything on noexec mount won't get PROT_EXEC */ return prot; } int security_mmap_file(struct file *file, unsigned long prot, unsigned long flags) { unsigned long prot_adj = mmap_prot(file, prot); int ret; ret = call_int_hook(mmap_file, 0, file, prot, prot_adj, flags); if (ret) return ret; return ima_file_mmap(file, prot, prot_adj, flags); } int security_mmap_addr(unsigned long addr) { return call_int_hook(mmap_addr, 0, addr); } int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) { int ret; ret = call_int_hook(file_mprotect, 0, vma, reqprot, prot); if (ret) return ret; return ima_file_mprotect(vma, prot); } int security_file_lock(struct file *file, unsigned int cmd) { return call_int_hook(file_lock, 0, file, cmd); } int security_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { return call_int_hook(file_fcntl, 0, file, cmd, arg); } void security_file_set_fowner(struct file *file) { call_void_hook(file_set_fowner, file); } int security_file_send_sigiotask(struct task_struct *tsk, struct fown_struct *fown, int sig) { return call_int_hook(file_send_sigiotask, 0, tsk, fown, sig); } int security_file_receive(struct file *file) { return call_int_hook(file_receive, 0, file); } int security_file_open(struct file *file) { int ret; ret = call_int_hook(file_open, 0, file); if (ret) return ret; return fsnotify_perm(file, MAY_OPEN); } int security_task_alloc(struct task_struct *task, unsigned long clone_flags) { int rc = lsm_task_alloc(task); if (rc) return rc; rc = call_int_hook(task_alloc, 0, task, clone_flags); if (unlikely(rc)) security_task_free(task); return rc; } void security_task_free(struct task_struct *task) { call_void_hook(task_free, task); kfree(task->security); task->security = NULL; } int security_cred_alloc_blank(struct cred *cred, gfp_t gfp) { int rc = lsm_cred_alloc(cred, gfp); if (rc) return rc; rc = call_int_hook(cred_alloc_blank, 0, cred, gfp); if (unlikely(rc)) security_cred_free(cred); return rc; } void security_cred_free(struct cred *cred) { /* * There is a failure case in prepare_creds() that * may result in a call here with ->security being NULL. */ if (unlikely(cred->security == NULL)) return; call_void_hook(cred_free, cred); kfree(cred->security); cred->security = NULL; } int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp) { int rc = lsm_cred_alloc(new, gfp); if (rc) return rc; rc = call_int_hook(cred_prepare, 0, new, old, gfp); if (unlikely(rc)) security_cred_free(new); return rc; } void security_transfer_creds(struct cred *new, const struct cred *old) { call_void_hook(cred_transfer, new, old); } void security_cred_getsecid(const struct cred *c, u32 *secid) { *secid = 0; call_void_hook(cred_getsecid, c, secid); } EXPORT_SYMBOL(security_cred_getsecid); int security_kernel_act_as(struct cred *new, u32 secid) { return call_int_hook(kernel_act_as, 0, new, secid); } int security_kernel_create_files_as(struct cred *new, struct inode *inode) { return call_int_hook(kernel_create_files_as, 0, new, inode); } int security_kernel_module_request(char *kmod_name) { int ret; ret = call_int_hook(kernel_module_request, 0, kmod_name); if (ret) return ret; return integrity_kernel_module_request(kmod_name); } int security_kernel_read_file(struct file *file, enum kernel_read_file_id id, bool contents) { int ret; ret = call_int_hook(kernel_read_file, 0, file, id, contents); if (ret) return ret; return ima_read_file(file, id, contents); } EXPORT_SYMBOL_GPL(security_kernel_read_file); int security_kernel_post_read_file(struct file *file, char *buf, loff_t size, enum kernel_read_file_id id) { int ret; ret = call_int_hook(kernel_post_read_file, 0, file, buf, size, id); if (ret) return ret; return ima_post_read_file(file, buf, size, id); } EXPORT_SYMBOL_GPL(security_kernel_post_read_file); int security_kernel_load_data(enum kernel_load_data_id id, bool contents) { int ret; ret = call_int_hook(kernel_load_data, 0, id, contents); if (ret) return ret; return ima_load_data(id, contents); } EXPORT_SYMBOL_GPL(security_kernel_load_data); int security_kernel_post_load_data(char *buf, loff_t size, enum kernel_load_data_id id, char *description) { int ret; ret = call_int_hook(kernel_post_load_data, 0, buf, size, id, description); if (ret) return ret; return ima_post_load_data(buf, size, id, description); } EXPORT_SYMBOL_GPL(security_kernel_post_load_data); int security_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { return call_int_hook(task_fix_setuid, 0, new, old, flags); } int security_task_fix_setgid(struct cred *new, const struct cred *old, int flags) { return call_int_hook(task_fix_setgid, 0, new, old, flags); } int security_task_setpgid(struct task_struct *p, pid_t pgid) { return call_int_hook(task_setpgid, 0, p, pgid); } int security_task_getpgid(struct task_struct *p) { return call_int_hook(task_getpgid, 0, p); } int security_task_getsid(struct task_struct *p) { return call_int_hook(task_getsid, 0, p); } void security_task_getsecid(struct task_struct *p, u32 *secid) { *secid = 0; call_void_hook(task_getsecid, p, secid); } EXPORT_SYMBOL(security_task_getsecid); int security_task_setnice(struct task_struct *p, int nice) { return call_int_hook(task_setnice, 0, p, nice); } int security_task_setioprio(struct task_struct *p, int ioprio) { return call_int_hook(task_setioprio, 0, p, ioprio); } int security_task_getioprio(struct task_struct *p) { return call_int_hook(task_getioprio, 0, p); } int security_task_prlimit(const struct cred *cred, const struct cred *tcred, unsigned int flags) { return call_int_hook(task_prlimit, 0, cred, tcred, flags); } int security_task_setrlimit(struct task_struct *p, unsigned int resource, struct rlimit *new_rlim) { return call_int_hook(task_setrlimit, 0, p, resource, new_rlim); } int security_task_setscheduler(struct task_struct *p) { return call_int_hook(task_setscheduler, 0, p); } int security_task_getscheduler(struct task_struct *p) { return call_int_hook(task_getscheduler, 0, p); } int security_task_movememory(struct task_struct *p) { return call_int_hook(task_movememory, 0, p); } int security_task_kill(struct task_struct *p, struct kernel_siginfo *info, int sig, const struct cred *cred) { return call_int_hook(task_kill, 0, p, info, sig, cred); } int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { int thisrc; int rc = LSM_RET_DEFAULT(task_prctl); struct security_hook_list *hp; hlist_for_each_entry(hp, &security_hook_heads.task_prctl, list) { thisrc = hp->hook.task_prctl(option, arg2, arg3, arg4, arg5); if (thisrc != LSM_RET_DEFAULT(task_prctl)) { rc = thisrc; if (thisrc != 0) break; } } return rc; } void security_task_to_inode(struct task_struct *p, struct inode *inode) { call_void_hook(task_to_inode, p, inode); } int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag) { return call_int_hook(ipc_permission, 0, ipcp, flag); } void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid) { *secid = 0; call_void_hook(ipc_getsecid, ipcp, secid); } int security_msg_msg_alloc(struct msg_msg *msg) { int rc = lsm_msg_msg_alloc(msg); if (unlikely(rc)) return rc; rc = call_int_hook(msg_msg_alloc_security, 0, msg); if (unlikely(rc)) security_msg_msg_free(msg); return rc; } void security_msg_msg_free(struct msg_msg *msg) { call_void_hook(msg_msg_free_security, msg); kfree(msg->security); msg->security = NULL; } int security_msg_queue_alloc(struct kern_ipc_perm *msq) { int rc = lsm_ipc_alloc(msq); if (unlikely(rc)) return rc; rc = call_int_hook(msg_queue_alloc_security, 0, msq); if (unlikely(rc)) security_msg_queue_free(msq); return rc; } void security_msg_queue_free(struct kern_ipc_perm *msq) { call_void_hook(msg_queue_free_security, msq); kfree(msq->security); msq->security = NULL; } int security_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg) { return call_int_hook(msg_queue_associate, 0, msq, msqflg); } int security_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd) { return call_int_hook(msg_queue_msgctl, 0, msq, cmd); } int security_msg_queue_msgsnd(struct kern_ipc_perm *msq, struct msg_msg *msg, int msqflg) { return call_int_hook(msg_queue_msgsnd, 0, msq, msg, msqflg); } int security_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg, struct task_struct *target, long type, int mode) { return call_int_hook(msg_queue_msgrcv, 0, msq, msg, target, type, mode); } int security_shm_alloc(struct kern_ipc_perm *shp) { int rc = lsm_ipc_alloc(shp); if (unlikely(rc)) return rc; rc = call_int_hook(shm_alloc_security, 0, shp); if (unlikely(rc)) security_shm_free(shp); return rc; } void security_shm_free(struct kern_ipc_perm *shp) { call_void_hook(shm_free_security, shp); kfree(shp->security); shp->security = NULL; } int security_shm_associate(struct kern_ipc_perm *shp, int shmflg) { return call_int_hook(shm_associate, 0, shp, shmflg); } int security_shm_shmctl(struct kern_ipc_perm *shp, int cmd) { return call_int_hook(shm_shmctl, 0, shp, cmd); } int security_shm_shmat(struct kern_ipc_perm *shp, char __user *shmaddr, int shmflg) { return call_int_hook(shm_shmat, 0, shp, shmaddr, shmflg); } int security_sem_alloc(struct kern_ipc_perm *sma) { int rc = lsm_ipc_alloc(sma); if (unlikely(rc)) return rc; rc = call_int_hook(sem_alloc_security, 0, sma); if (unlikely(rc)) security_sem_free(sma); return rc; } void security_sem_free(struct kern_ipc_perm *sma) { call_void_hook(sem_free_security, sma); kfree(sma->security); sma->security = NULL; } int security_sem_associate(struct kern_ipc_perm *sma, int semflg) { return call_int_hook(sem_associate, 0, sma, semflg); } int security_sem_semctl(struct kern_ipc_perm *sma, int cmd) { return call_int_hook(sem_semctl, 0, sma, cmd); } int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops, unsigned nsops, int alter) { return call_int_hook(sem_semop, 0, sma, sops, nsops, alter); } void security_d_instantiate(struct dentry *dentry, struct inode *inode) { if (unlikely(inode && IS_PRIVATE(inode))) return; call_void_hook(d_instantiate, dentry, inode); } EXPORT_SYMBOL(security_d_instantiate); int security_getprocattr(struct task_struct *p, const char *lsm, char *name, char **value) { struct security_hook_list *hp; hlist_for_each_entry(hp, &security_hook_heads.getprocattr, list) { if (lsm != NULL && strcmp(lsm, hp->lsm)) continue; return hp->hook.getprocattr(p, name, value); } return LSM_RET_DEFAULT(getprocattr); } int security_setprocattr(const char *lsm, const char *name, void *value, size_t size) { struct security_hook_list *hp; hlist_for_each_entry(hp, &security_hook_heads.setprocattr, list) { if (lsm != NULL && strcmp(lsm, hp->lsm)) continue; return hp->hook.setprocattr(name, value, size); } return LSM_RET_DEFAULT(setprocattr); } int security_netlink_send(struct sock *sk, struct sk_buff *skb) { return call_int_hook(netlink_send, 0, sk, skb); } int security_ismaclabel(const char *name) { return call_int_hook(ismaclabel, 0, name); } EXPORT_SYMBOL(security_ismaclabel); int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) { struct security_hook_list *hp; int rc; /* * Currently, only one LSM can implement secid_to_secctx (i.e this * LSM hook is not "stackable"). */ hlist_for_each_entry(hp, &security_hook_heads.secid_to_secctx, list) { rc = hp->hook.secid_to_secctx(secid, secdata, seclen); if (rc != LSM_RET_DEFAULT(secid_to_secctx)) return rc; } return LSM_RET_DEFAULT(secid_to_secctx); } EXPORT_SYMBOL(security_secid_to_secctx); int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid) { *secid = 0; return call_int_hook(secctx_to_secid, 0, secdata, seclen, secid); } EXPORT_SYMBOL(security_secctx_to_secid); void security_release_secctx(char *secdata, u32 seclen) { call_void_hook(release_secctx, secdata, seclen); } EXPORT_SYMBOL(security_release_secctx); void security_inode_invalidate_secctx(struct inode *inode) { call_void_hook(inode_invalidate_secctx, inode); } EXPORT_SYMBOL(security_inode_invalidate_secctx); int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen) { return call_int_hook(inode_notifysecctx, 0, inode, ctx, ctxlen); } EXPORT_SYMBOL(security_inode_notifysecctx); int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen) { return call_int_hook(inode_setsecctx, 0, dentry, ctx, ctxlen); } EXPORT_SYMBOL(security_inode_setsecctx); int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) { struct security_hook_list *hp; int rc; /* * Only one module will provide a security context. */ hlist_for_each_entry(hp, &security_hook_heads.inode_getsecctx, list) { rc = hp->hook.inode_getsecctx(inode, ctx, ctxlen); if (rc != LSM_RET_DEFAULT(inode_getsecctx)) return rc; } return LSM_RET_DEFAULT(inode_getsecctx); } EXPORT_SYMBOL(security_inode_getsecctx); #ifdef CONFIG_WATCH_QUEUE int security_post_notification(const struct cred *w_cred, const struct cred *cred, struct watch_notification *n) { return call_int_hook(post_notification, 0, w_cred, cred, n); } #endif /* CONFIG_WATCH_QUEUE */ #ifdef CONFIG_KEY_NOTIFICATIONS int security_watch_key(struct key *key) { return call_int_hook(watch_key, 0, key); } #endif #ifdef CONFIG_SECURITY_NETWORK int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk) { return call_int_hook(unix_stream_connect, 0, sock, other, newsk); } EXPORT_SYMBOL(security_unix_stream_connect); int security_unix_may_send(struct socket *sock, struct socket *other) { return call_int_hook(unix_may_send, 0, sock, other); } EXPORT_SYMBOL(security_unix_may_send); int security_socket_create(int family, int type, int protocol, int kern) { return call_int_hook(socket_create, 0, family, type, protocol, kern); } int security_socket_post_create(struct socket *sock, int family, int type, int protocol, int kern) { return call_int_hook(socket_post_create, 0, sock, family, type, protocol, kern); } int security_socket_socketpair(struct socket *socka, struct socket *sockb) { return call_int_hook(socket_socketpair, 0, socka, sockb); } EXPORT_SYMBOL(security_socket_socketpair); int security_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen) { return call_int_hook(socket_bind, 0, sock, address, addrlen); } int security_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen) { return call_int_hook(socket_connect, 0, sock, address, addrlen); } int security_socket_listen(struct socket *sock, int backlog) { return call_int_hook(socket_listen, 0, sock, backlog); } int security_socket_accept(struct socket *sock, struct socket *newsock) { return call_int_hook(socket_accept, 0, sock, newsock); } int security_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { return call_int_hook(socket_sendmsg, 0, sock, msg, size); } int security_socket_recvmsg(struct socket *sock, struct msghdr *msg, int size, int flags) { return call_int_hook(socket_recvmsg, 0, sock, msg, size, flags); } int security_socket_getsockname(struct socket *sock) { return call_int_hook(socket_getsockname, 0, sock); } int security_socket_getpeername(struct socket *sock) { return call_int_hook(socket_getpeername, 0, sock); } int security_socket_getsockopt(struct socket *sock, int level, int optname) { return call_int_hook(socket_getsockopt, 0, sock, level, optname); } int security_socket_setsockopt(struct socket *sock, int level, int optname) { return call_int_hook(socket_setsockopt, 0, sock, level, optname); } int security_socket_shutdown(struct socket *sock, int how) { return call_int_hook(socket_shutdown, 0, sock, how); } int security_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) { return call_int_hook(socket_sock_rcv_skb, 0, sk, skb); } EXPORT_SYMBOL(security_sock_rcv_skb); int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval, sockptr_t optlen, unsigned int len) { struct security_hook_list *hp; int rc; /* * Only one module will provide a security context. */ hlist_for_each_entry(hp, &security_hook_heads.socket_getpeersec_stream, list) { rc = hp->hook.socket_getpeersec_stream(sock, optval, optlen, len); if (rc != LSM_RET_DEFAULT(socket_getpeersec_stream)) return rc; } return LSM_RET_DEFAULT(socket_getpeersec_stream); } int security_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { struct security_hook_list *hp; int rc; /* * Only one module will provide a security context. */ hlist_for_each_entry(hp, &security_hook_heads.socket_getpeersec_dgram, list) { rc = hp->hook.socket_getpeersec_dgram(sock, skb, secid); if (rc != LSM_RET_DEFAULT(socket_getpeersec_dgram)) return rc; } return LSM_RET_DEFAULT(socket_getpeersec_dgram); } EXPORT_SYMBOL(security_socket_getpeersec_dgram); int security_sk_alloc(struct sock *sk, int family, gfp_t priority) { return call_int_hook(sk_alloc_security, 0, sk, family, priority); } void security_sk_free(struct sock *sk) { call_void_hook(sk_free_security, sk); } void security_sk_clone(const struct sock *sk, struct sock *newsk) { call_void_hook(sk_clone_security, sk, newsk); } EXPORT_SYMBOL(security_sk_clone); void security_sk_classify_flow(struct sock *sk, struct flowi_common *flic) { call_void_hook(sk_getsecid, sk, &flic->flowic_secid); } EXPORT_SYMBOL(security_sk_classify_flow); void security_req_classify_flow(const struct request_sock *req, struct flowi_common *flic) { call_void_hook(req_classify_flow, req, flic); } EXPORT_SYMBOL(security_req_classify_flow); void security_sock_graft(struct sock *sk, struct socket *parent) { call_void_hook(sock_graft, sk, parent); } EXPORT_SYMBOL(security_sock_graft); int security_inet_conn_request(struct sock *sk, struct sk_buff *skb, struct request_sock *req) { return call_int_hook(inet_conn_request, 0, sk, skb, req); } EXPORT_SYMBOL(security_inet_conn_request); void security_inet_csk_clone(struct sock *newsk, const struct request_sock *req) { call_void_hook(inet_csk_clone, newsk, req); } void security_inet_conn_established(struct sock *sk, struct sk_buff *skb) { call_void_hook(inet_conn_established, sk, skb); } EXPORT_SYMBOL(security_inet_conn_established); int security_secmark_relabel_packet(u32 secid) { return call_int_hook(secmark_relabel_packet, 0, secid); } EXPORT_SYMBOL(security_secmark_relabel_packet); void security_secmark_refcount_inc(void) { call_void_hook(secmark_refcount_inc); } EXPORT_SYMBOL(security_secmark_refcount_inc); void security_secmark_refcount_dec(void) { call_void_hook(secmark_refcount_dec); } EXPORT_SYMBOL(security_secmark_refcount_dec); int security_tun_dev_alloc_security(void **security) { return call_int_hook(tun_dev_alloc_security, 0, security); } EXPORT_SYMBOL(security_tun_dev_alloc_security); void security_tun_dev_free_security(void *security) { call_void_hook(tun_dev_free_security, security); } EXPORT_SYMBOL(security_tun_dev_free_security); int security_tun_dev_create(void) { return call_int_hook(tun_dev_create, 0); } EXPORT_SYMBOL(security_tun_dev_create); int security_tun_dev_attach_queue(void *security) { return call_int_hook(tun_dev_attach_queue, 0, security); } EXPORT_SYMBOL(security_tun_dev_attach_queue); int security_tun_dev_attach(struct sock *sk, void *security) { return call_int_hook(tun_dev_attach, 0, sk, security); } EXPORT_SYMBOL(security_tun_dev_attach); int security_tun_dev_open(void *security) { return call_int_hook(tun_dev_open, 0, security); } EXPORT_SYMBOL(security_tun_dev_open); int security_sctp_assoc_request(struct sctp_endpoint *ep, struct sk_buff *skb) { return call_int_hook(sctp_assoc_request, 0, ep, skb); } EXPORT_SYMBOL(security_sctp_assoc_request); int security_sctp_bind_connect(struct sock *sk, int optname, struct sockaddr *address, int addrlen) { return call_int_hook(sctp_bind_connect, 0, sk, optname, address, addrlen); } EXPORT_SYMBOL(security_sctp_bind_connect); void security_sctp_sk_clone(struct sctp_endpoint *ep, struct sock *sk, struct sock *newsk) { call_void_hook(sctp_sk_clone, ep, sk, newsk); } EXPORT_SYMBOL(security_sctp_sk_clone); #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND int security_ib_pkey_access(void *sec, u64 subnet_prefix, u16 pkey) { return call_int_hook(ib_pkey_access, 0, sec, subnet_prefix, pkey); } EXPORT_SYMBOL(security_ib_pkey_access); int security_ib_endport_manage_subnet(void *sec, const char *dev_name, u8 port_num) { return call_int_hook(ib_endport_manage_subnet, 0, sec, dev_name, port_num); } EXPORT_SYMBOL(security_ib_endport_manage_subnet); int security_ib_alloc_security(void **sec) { return call_int_hook(ib_alloc_security, 0, sec); } EXPORT_SYMBOL(security_ib_alloc_security); void security_ib_free_security(void *sec) { call_void_hook(ib_free_security, sec); } EXPORT_SYMBOL(security_ib_free_security); #endif /* CONFIG_SECURITY_INFINIBAND */ #ifdef CONFIG_SECURITY_NETWORK_XFRM int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *sec_ctx, gfp_t gfp) { return call_int_hook(xfrm_policy_alloc_security, 0, ctxp, sec_ctx, gfp); } EXPORT_SYMBOL(security_xfrm_policy_alloc); int security_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx, struct xfrm_sec_ctx **new_ctxp) { return call_int_hook(xfrm_policy_clone_security, 0, old_ctx, new_ctxp); } void security_xfrm_policy_free(struct xfrm_sec_ctx *ctx) { call_void_hook(xfrm_policy_free_security, ctx); } EXPORT_SYMBOL(security_xfrm_policy_free); int security_xfrm_policy_delete(struct xfrm_sec_ctx *ctx) { return call_int_hook(xfrm_policy_delete_security, 0, ctx); } int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx) { return call_int_hook(xfrm_state_alloc, 0, x, sec_ctx); } EXPORT_SYMBOL(security_xfrm_state_alloc); int security_xfrm_state_alloc_acquire(struct xfrm_state *x, struct xfrm_sec_ctx *polsec, u32 secid) { return call_int_hook(xfrm_state_alloc_acquire, 0, x, polsec, secid); } int security_xfrm_state_delete(struct xfrm_state *x) { return call_int_hook(xfrm_state_delete_security, 0, x); } EXPORT_SYMBOL(security_xfrm_state_delete); void security_xfrm_state_free(struct xfrm_state *x) { call_void_hook(xfrm_state_free_security, x); } int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir) { return call_int_hook(xfrm_policy_lookup, 0, ctx, fl_secid, dir); } int security_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy *xp, const struct flowi_common *flic) { struct security_hook_list *hp; int rc = LSM_RET_DEFAULT(xfrm_state_pol_flow_match); /* * Since this function is expected to return 0 or 1, the judgment * becomes difficult if multiple LSMs supply this call. Fortunately, * we can use the first LSM's judgment because currently only SELinux * supplies this call. * * For speed optimization, we explicitly break the loop rather than * using the macro */ hlist_for_each_entry(hp, &security_hook_heads.xfrm_state_pol_flow_match, list) { rc = hp->hook.xfrm_state_pol_flow_match(x, xp, flic); break; } return rc; } int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid) { return call_int_hook(xfrm_decode_session, 0, skb, secid, 1); } void security_skb_classify_flow(struct sk_buff *skb, struct flowi_common *flic) { int rc = call_int_hook(xfrm_decode_session, 0, skb, &flic->flowic_secid, 0); BUG_ON(rc); } EXPORT_SYMBOL(security_skb_classify_flow); #endif /* CONFIG_SECURITY_NETWORK_XFRM */ #ifdef CONFIG_KEYS int security_key_alloc(struct key *key, const struct cred *cred, unsigned long flags) { return call_int_hook(key_alloc, 0, key, cred, flags); } void security_key_free(struct key *key) { call_void_hook(key_free, key); } int security_key_permission(key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm) { return call_int_hook(key_permission, 0, key_ref, cred, need_perm); } int security_key_getsecurity(struct key *key, char **_buffer) { *_buffer = NULL; return call_int_hook(key_getsecurity, 0, key, _buffer); } #endif /* CONFIG_KEYS */ #ifdef CONFIG_AUDIT int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule) { return call_int_hook(audit_rule_init, 0, field, op, rulestr, lsmrule); } int security_audit_rule_known(struct audit_krule *krule) { return call_int_hook(audit_rule_known, 0, krule); } void security_audit_rule_free(void *lsmrule) { call_void_hook(audit_rule_free, lsmrule); } int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule) { return call_int_hook(audit_rule_match, 0, secid, field, op, lsmrule); } #endif /* CONFIG_AUDIT */ #ifdef CONFIG_BPF_SYSCALL int security_bpf(int cmd, union bpf_attr *attr, unsigned int size) { return call_int_hook(bpf, 0, cmd, attr, size); } int security_bpf_map(struct bpf_map *map, fmode_t fmode) { return call_int_hook(bpf_map, 0, map, fmode); } int security_bpf_prog(struct bpf_prog *prog) { return call_int_hook(bpf_prog, 0, prog); } int security_bpf_map_alloc(struct bpf_map *map) { return call_int_hook(bpf_map_alloc_security, 0, map); } int security_bpf_prog_alloc(struct bpf_prog_aux *aux) { return call_int_hook(bpf_prog_alloc_security, 0, aux); } void security_bpf_map_free(struct bpf_map *map) { call_void_hook(bpf_map_free_security, map); } void security_bpf_prog_free(struct bpf_prog_aux *aux) { call_void_hook(bpf_prog_free_security, aux); } #endif /* CONFIG_BPF_SYSCALL */ int security_locked_down(enum lockdown_reason what) { return call_int_hook(locked_down, 0, what); } EXPORT_SYMBOL(security_locked_down); #ifdef CONFIG_PERF_EVENTS int security_perf_event_open(struct perf_event_attr *attr, int type) { return call_int_hook(perf_event_open, 0, attr, type); } int security_perf_event_alloc(struct perf_event *event) { return call_int_hook(perf_event_alloc, 0, event); } void security_perf_event_free(struct perf_event *event) { call_void_hook(perf_event_free, event); } int security_perf_event_read(struct perf_event *event) { return call_int_hook(perf_event_read, 0, event); } int security_perf_event_write(struct perf_event *event) { return call_int_hook(perf_event_write, 0, event); } #endif /* CONFIG_PERF_EVENTS */
3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/exec.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * #!-checking implemented by tytso. */ /* * Demand-loading implemented 01.12.91 - no need to read anything but * the header into memory. The inode of the executable is put into * "current->executable", and page faults do the actual loading. Clean. * * Once more I can proudly say that linux stood up to being changed: it * was less than 2 hours work to get demand-loading completely implemented. * * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead, * current->executable is only used by the procfs. This allows a dispatch * table to check for several different types of binary formats. We keep * trying until we recognize the file or we run out of supported binary * formats. */ #include <linux/kernel_read_file.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/mm.h> #include <linux/vmacache.h> #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/swap.h> #include <linux/string.h> #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/signal.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/pagemap.h> #include <linux/perf_event.h> #include <linux/highmem.h> #include <linux/spinlock.h> #include <linux/key.h> #include <linux/personality.h> #include <linux/binfmts.h> #include <linux/utsname.h> #include <linux/pid_namespace.h> #include <linux/module.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/audit.h> #include <linux/tracehook.h> #include <linux/kmod.h> #include <linux/fsnotify.h> #include <linux/fs_struct.h> #include <linux/oom.h> #include <linux/compat.h> #include <linux/vmalloc.h> #include <linux/io_uring.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/tlb.h> #include <trace/events/task.h> #include "internal.h" #include <trace/events/sched.h> static int bprm_creds_from_file(struct linux_binprm *bprm); int suid_dumpable = 0; static LIST_HEAD(formats); static DEFINE_RWLOCK(binfmt_lock); void __register_binfmt(struct linux_binfmt * fmt, int insert) { BUG_ON(!fmt); if (WARN_ON(!fmt->load_binary)) return; write_lock(&binfmt_lock); insert ? list_add(&fmt->lh, &formats) : list_add_tail(&fmt->lh, &formats); write_unlock(&binfmt_lock); } EXPORT_SYMBOL(__register_binfmt); void unregister_binfmt(struct linux_binfmt * fmt) { write_lock(&binfmt_lock); list_del(&fmt->lh); write_unlock(&binfmt_lock); } EXPORT_SYMBOL(unregister_binfmt); static inline void put_binfmt(struct linux_binfmt * fmt) { module_put(fmt->module); } bool path_noexec(const struct path *path) { return (path->mnt->mnt_flags & MNT_NOEXEC) || (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC); } #ifdef CONFIG_USELIB /* * Note that a shared library must be both readable and executable due to * security reasons. * * Also note that we take the address to load from from the file itself. */ SYSCALL_DEFINE1(uselib, const char __user *, library) { struct linux_binfmt *fmt; struct file *file; struct filename *tmp = getname(library); int error = PTR_ERR(tmp); static const struct open_flags uselib_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_READ | MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; if (IS_ERR(tmp)) goto out; file = do_filp_open(AT_FDCWD, tmp, &uselib_flags); putname(tmp); error = PTR_ERR(file); if (IS_ERR(file)) goto out; /* * may_open() has already checked for this, so it should be * impossible to trip now. But we need to be extra cautious * and check again at the very end too. */ error = -EACCES; if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) || path_noexec(&file->f_path))) goto exit; fsnotify_open(file); error = -ENOEXEC; read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { if (!fmt->load_shlib) continue; if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); error = fmt->load_shlib(file); read_lock(&binfmt_lock); put_binfmt(fmt); if (error != -ENOEXEC) break; } read_unlock(&binfmt_lock); exit: fput(file); out: return error; } #endif /* #ifdef CONFIG_USELIB */ #ifdef CONFIG_MMU /* * The nascent bprm->mm is not visible until exec_mmap() but it can * use a lot of memory, account these pages in current->mm temporary * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we * change the counter back via acct_arg_size(0). */ static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { struct mm_struct *mm = current->mm; long diff = (long)(pages - bprm->vma_pages); if (!mm || !diff) return; bprm->vma_pages = pages; add_mm_counter(mm, MM_ANONPAGES, diff); } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; int ret; unsigned int gup_flags = FOLL_FORCE; #ifdef CONFIG_STACK_GROWSUP if (write) { ret = expand_downwards(bprm->vma, pos); if (ret < 0) return NULL; } #endif if (write) gup_flags |= FOLL_WRITE; /* * We are doing an exec(). 'current' is the process * doing the exec and bprm->mm is the new process's mm. */ ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags, &page, NULL, NULL); if (ret <= 0) return NULL; if (write) acct_arg_size(bprm, vma_pages(bprm->vma)); return page; } static void put_arg_page(struct page *page) { put_page(page); } static void free_arg_pages(struct linux_binprm *bprm) { } static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, struct page *page) { flush_cache_page(bprm->vma, pos, page_to_pfn(page)); } static int __bprm_mm_init(struct linux_binprm *bprm) { int err; struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; bprm->vma = vma = vm_area_alloc(mm); if (!vma) return -ENOMEM; vma_set_anonymous(vma); if (mmap_write_lock_killable(mm)) { err = -EINTR; goto err_free; } /* * Place the stack at the largest stack address the architecture * supports. Later, we'll move this to an appropriate place. We don't * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); err = insert_vm_struct(mm, vma); if (err) goto err; mm->stack_vm = mm->total_vm = 1; mmap_write_unlock(mm); bprm->p = vma->vm_end - sizeof(void *); return 0; err: mmap_write_unlock(mm); err_free: bprm->vma = NULL; vm_area_free(vma); return err; } static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= MAX_ARG_STRLEN; } #else static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; page = bprm->page[pos / PAGE_SIZE]; if (!page && write) { page = alloc_page(GFP_HIGHUSER|__GFP_ZERO); if (!page) return NULL; bprm->page[pos / PAGE_SIZE] = page; } return page; } static void put_arg_page(struct page *page) { } static void free_arg_page(struct linux_binprm *bprm, int i) { if (bprm->page[i]) { __free_page(bprm->page[i]); bprm->page[i] = NULL; } } static void free_arg_pages(struct linux_binprm *bprm) { int i; for (i = 0; i < MAX_ARG_PAGES; i++) free_arg_page(bprm, i); } static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, struct page *page) { } static int __bprm_mm_init(struct linux_binprm *bprm) { bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); return 0; } static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= bprm->p; } #endif /* CONFIG_MMU */ /* * Create a new mm_struct and populate it with a temporary stack * vm_area_struct. We don't have enough context at this point to set the stack * flags, permissions, and offset, so we use temporary values. We'll update * them later in setup_arg_pages(). */ static int bprm_mm_init(struct linux_binprm *bprm) { int err; struct mm_struct *mm = NULL; bprm->mm = mm = mm_alloc(); err = -ENOMEM; if (!mm) goto err; /* Save current stack limit for all calculations made during exec. */ task_lock(current->group_leader); bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK]; task_unlock(current->group_leader); err = __bprm_mm_init(bprm); if (err) goto err; return 0; err: if (mm) { bprm->mm = NULL; mmdrop(mm); } return err; } struct user_arg_ptr { #ifdef CONFIG_COMPAT bool is_compat; #endif union { const char __user *const __user *native; #ifdef CONFIG_COMPAT const compat_uptr_t __user *compat; #endif } ptr; }; static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr) { const char __user *native; #ifdef CONFIG_COMPAT if (unlikely(argv.is_compat)) { compat_uptr_t compat; if (get_user(compat, argv.ptr.compat + nr)) return ERR_PTR(-EFAULT); return compat_ptr(compat); } #endif if (get_user(native, argv.ptr.native + nr)) return ERR_PTR(-EFAULT); return native; } /* * count() counts the number of strings in array ARGV. */ static int count(struct user_arg_ptr argv, int max) { int i = 0; if (argv.ptr.native != NULL) { for (;;) { const char __user *p = get_user_arg_ptr(argv, i); if (!p) break; if (IS_ERR(p)) return -EFAULT; if (i >= max) return -E2BIG; ++i; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } } return i; } static int count_strings_kernel(const char *const *argv) { int i; if (!argv) return 0; for (i = 0; argv[i]; ++i) { if (i >= MAX_ARG_STRINGS) return -E2BIG; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } return i; } static int bprm_stack_limits(struct linux_binprm *bprm) { unsigned long limit, ptr_size; /* * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM * (whichever is smaller) for the argv+env strings. * This ensures that: * - the remaining binfmt code will not run out of stack space, * - the program will have a reasonable amount of stack left * to work from. */ limit = _STK_LIM / 4 * 3; limit = min(limit, bprm->rlim_stack.rlim_cur / 4); /* * We've historically supported up to 32 pages (ARG_MAX) * of argument strings even with small stacks */ limit = max_t(unsigned long, limit, ARG_MAX); /* * We must account for the size of all the argv and envp pointers to * the argv and envp strings, since they will also take up space in * the stack. They aren't stored until much later when we can't * signal to the parent that the child has run out of stack space. * Instead, calculate it here so it's possible to fail gracefully. * * In the case of argc = 0, make sure there is space for adding a * empty string (which will bump argc to 1), to ensure confused * userspace programs don't start processing from argv[1], thinking * argc can never be 0, to keep them from walking envp by accident. * See do_execveat_common(). */ ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *); if (limit <= ptr_size) return -E2BIG; limit -= ptr_size; bprm->argmin = bprm->p - limit; return 0; } /* * 'copy_strings()' copies argument/environment strings from the old * processes's memory to the new process's stack. The call to get_user_pages() * ensures the destination page is created and not swapped out. */ static int copy_strings(int argc, struct user_arg_ptr argv, struct linux_binprm *bprm) { struct page *kmapped_page = NULL; char *kaddr = NULL; unsigned long kpos = 0; int ret; while (argc-- > 0) { const char __user *str; int len; unsigned long pos; ret = -EFAULT; str = get_user_arg_ptr(argv, argc); if (IS_ERR(str)) goto out; len = strnlen_user(str, MAX_ARG_STRLEN); if (!len) goto out; ret = -E2BIG; if (!valid_arg_len(bprm, len)) goto out; /* We're going to work our way backwords. */ pos = bprm->p; str += len; bprm->p -= len; #ifdef CONFIG_MMU if (bprm->p < bprm->argmin) goto out; #endif while (len > 0) { int offset, bytes_to_copy; if (fatal_signal_pending(current)) { ret = -ERESTARTNOHAND; goto out; } cond_resched(); offset = pos % PAGE_SIZE; if (offset == 0) offset = PAGE_SIZE; bytes_to_copy = offset; if (bytes_to_copy > len) bytes_to_copy = len; offset -= bytes_to_copy; pos -= bytes_to_copy; str -= bytes_to_copy; len -= bytes_to_copy; if (!kmapped_page || kpos != (pos & PAGE_MASK)) { struct page *page; page = get_arg_page(bprm, pos, 1); if (!page) { ret = -E2BIG; goto out; } if (kmapped_page) { flush_kernel_dcache_page(kmapped_page); kunmap(kmapped_page); put_arg_page(kmapped_page); } kmapped_page = page; kaddr = kmap(kmapped_page); kpos = pos & PAGE_MASK; flush_arg_page(bprm, kpos, kmapped_page); } if (copy_from_user(kaddr+offset, str, bytes_to_copy)) { ret = -EFAULT; goto out; } } } ret = 0; out: if (kmapped_page) { flush_kernel_dcache_page(kmapped_page); kunmap(kmapped_page); put_arg_page(kmapped_page); } return ret; } /* * Copy and argument/environment string from the kernel to the processes stack. */ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) { int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */; unsigned long pos = bprm->p; if (len == 0) return -EFAULT; if (!valid_arg_len(bprm, len)) return -E2BIG; /* We're going to work our way backwards. */ arg += len; bprm->p -= len; if (IS_ENABLED(CONFIG_MMU) && bprm->p < bprm->argmin) return -E2BIG; while (len > 0) { unsigned int bytes_to_copy = min_t(unsigned int, len, min_not_zero(offset_in_page(pos), PAGE_SIZE)); struct page *page; char *kaddr; pos -= bytes_to_copy; arg -= bytes_to_copy; len -= bytes_to_copy; page = get_arg_page(bprm, pos, 1); if (!page) return -E2BIG; kaddr = kmap_atomic(page); flush_arg_page(bprm, pos & PAGE_MASK, page); memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy); flush_kernel_dcache_page(page); kunmap_atomic(kaddr); put_arg_page(page); } return 0; } EXPORT_SYMBOL(copy_string_kernel); static int copy_strings_kernel(int argc, const char *const *argv, struct linux_binprm *bprm) { while (argc-- > 0) { int ret = copy_string_kernel(argv[argc], bprm); if (ret < 0) return ret; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } return 0; } #ifdef CONFIG_MMU /* * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once * the binfmt code determines where the new stack should reside, we shift it to * its final location. The process proceeds as follows: * * 1) Use shift to calculate the new vma endpoints. * 2) Extend vma to cover both the old and new ranges. This ensures the * arguments passed to subsequent functions are consistent. * 3) Move vma's page tables to the new range. * 4) Free up any cleared pgd range. * 5) Shrink the vma to cover only the new range. */ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) { struct mm_struct *mm = vma->vm_mm; unsigned long old_start = vma->vm_start; unsigned long old_end = vma->vm_end; unsigned long length = old_end - old_start; unsigned long new_start = old_start - shift; unsigned long new_end = old_end - shift; struct mmu_gather tlb; BUG_ON(new_start > new_end); /* * ensure there are no vmas between where we want to go * and where we are */ if (vma != find_vma(mm, new_start)) return -EFAULT; /* * cover the whole range: [new_start, old_end) */ if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL)) return -ENOMEM; /* * move the page tables downwards, on failure we rely on * process cleanup to remove whatever mess we made. */ if (length != move_page_tables(vma, old_start, vma, new_start, length, false)) return -ENOMEM; lru_add_drain(); tlb_gather_mmu(&tlb, mm, old_start, old_end); if (new_end > old_start) { /* * when the old and new regions overlap clear from new_end. */ free_pgd_range(&tlb, new_end, old_end, new_end, vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); } else { /* * otherwise, clean from old_start; this is done to not touch * the address space in [new_end, old_start) some architectures * have constraints on va-space that make this illegal (IA64) - * for the others its just a little faster. */ free_pgd_range(&tlb, old_start, old_end, new_end, vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); } tlb_finish_mmu(&tlb, old_start, old_end); /* * Shrink the vma to just the new range. Always succeeds. */ vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL); return 0; } /* * Finalizes the stack vm_area_struct. The flags and permissions are updated, * the stack is optionally relocated, and some extra space is added. */ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack) { unsigned long ret; unsigned long stack_shift; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = bprm->vma; struct vm_area_struct *prev = NULL; unsigned long vm_flags; unsigned long stack_base; unsigned long stack_size; unsigned long stack_expand; unsigned long rlim_stack; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ stack_base = bprm->rlim_stack.rlim_max; if (stack_base > STACK_SIZE_MAX) stack_base = STACK_SIZE_MAX; /* Add space for stack randomization. */ stack_base += (STACK_RND_MASK << PAGE_SHIFT); /* Make sure we didn't let the argument array grow too large. */ if (vma->vm_end - vma->vm_start > stack_base) return -ENOMEM; stack_base = PAGE_ALIGN(stack_top - stack_base); stack_shift = vma->vm_start - stack_base; mm->arg_start = bprm->p - stack_shift; bprm->p = vma->vm_end - stack_shift; #else stack_top = arch_align_stack(stack_top); stack_top = PAGE_ALIGN(stack_top); if (unlikely(stack_top < mmap_min_addr) || unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr)) return -ENOMEM; stack_shift = vma->vm_end - stack_top; bprm->p -= stack_shift; mm->arg_start = bprm->p; #endif if (bprm->loader) bprm->loader -= stack_shift; bprm->exec -= stack_shift; if (mmap_write_lock_killable(mm)) return -EINTR; vm_flags = VM_STACK_FLAGS; /* * Adjust stack execute permissions; explicitly enable for * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone * (arch default) otherwise. */ if (unlikely(executable_stack == EXSTACK_ENABLE_X)) vm_flags |= VM_EXEC; else if (executable_stack == EXSTACK_DISABLE_X) vm_flags &= ~VM_EXEC; vm_flags |= mm->def_flags; vm_flags |= VM_STACK_INCOMPLETE_SETUP; ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, vm_flags); if (ret) goto out_unlock; BUG_ON(prev != vma); if (unlikely(vm_flags & VM_EXEC)) { pr_warn_once("process '%pD4' started with executable stack\n", bprm->file); } /* Move stack pages down in memory. */ if (stack_shift) { ret = shift_arg_pages(vma, stack_shift); if (ret) goto out_unlock; } /* mprotect_fixup is overkill to remove the temporary stack flags */ vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP; stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ stack_size = vma->vm_end - vma->vm_start; /* * Align this down to a page boundary as expand_stack * will align it up. */ rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK; #ifdef CONFIG_STACK_GROWSUP if (stack_size + stack_expand > rlim_stack) stack_base = vma->vm_start + rlim_stack; else stack_base = vma->vm_end + stack_expand; #else if (stack_size + stack_expand > rlim_stack) stack_base = vma->vm_end - rlim_stack; else stack_base = vma->vm_start - stack_expand; #endif current->mm->start_stack = bprm->p; ret = expand_stack(vma, stack_base); if (ret) ret = -EFAULT; out_unlock: mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL(setup_arg_pages); #else /* * Transfer the program arguments and environment from the holding pages * onto the stack. The provided stack pointer is adjusted accordingly. */ int transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *sp_location) { unsigned long index, stop, sp; int ret = 0; stop = bprm->p >> PAGE_SHIFT; sp = *sp_location; for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0; char *src = kmap(bprm->page[index]) + offset; sp -= PAGE_SIZE - offset; if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0) ret = -EFAULT; kunmap(bprm->page[index]); if (ret) goto out; } bprm->exec += *sp_location - MAX_ARG_PAGES * PAGE_SIZE; *sp_location = sp; out: return ret; } EXPORT_SYMBOL(transfer_args_to_stack); #endif /* CONFIG_MMU */ static struct file *do_open_execat(int fd, struct filename *name, int flags) { struct file *file; int err; struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return ERR_PTR(-EINVAL); if (flags & AT_SYMLINK_NOFOLLOW) open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) open_exec_flags.lookup_flags |= LOOKUP_EMPTY; file = do_filp_open(fd, name, &open_exec_flags); if (IS_ERR(file)) goto out; /* * may_open() has already checked for this, so it should be * impossible to trip now. But we need to be extra cautious * and check again at the very end too. */ err = -EACCES; if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) || path_noexec(&file->f_path))) goto exit; err = deny_write_access(file); if (err) goto exit; if (name->name[0] != '\0') fsnotify_open(file); out: return file; exit: fput(file); return ERR_PTR(err); } struct file *open_exec(const char *name) { struct filename *filename = getname_kernel(name); struct file *f = ERR_CAST(filename); if (!IS_ERR(filename)) { f = do_open_execat(AT_FDCWD, filename, 0); putname(filename); } return f; } EXPORT_SYMBOL(open_exec); #if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \ defined(CONFIG_BINFMT_ELF_FDPIC) ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); if (res > 0) flush_icache_user_range(addr, addr + len); return res; } EXPORT_SYMBOL(read_code); #endif /* * Maps the mm_struct mm into the current task struct. * On success, this function returns with exec_update_lock * held for writing. */ static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct *old_mm, *active_mm; int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; exec_mm_release(tsk, old_mm); if (old_mm) sync_mm_rss(old_mm); ret = down_write_killable(&tsk->signal->exec_update_lock); if (ret) return ret; if (old_mm) { /* * Make sure that if there is a core dump in progress * for the old mm, we get out and die instead of going * through with the exec. We must hold mmap_lock around * checking core_state and changing tsk->mm. */ mmap_read_lock(old_mm); if (unlikely(old_mm->core_state)) { mmap_read_unlock(old_mm); up_write(&tsk->signal->exec_update_lock); return -EINTR; } } task_lock(tsk); membarrier_exec_mmap(mm); local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; /* * This prevents preemption while active_mm is being loaded and * it and mm are being updated, which could cause problems for * lazy tlb mm refcounting when these are updated by context * switches. Not all architectures can handle irqs off over * activate_mm yet. */ if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); activate_mm(active_mm, mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); tsk->mm->vmacache_seqnum = 0; vmacache_flush(tsk); task_unlock(tsk); if (old_mm) { mmap_read_unlock(old_mm); BUG_ON(active_mm != old_mm); setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm); mm_update_next_owner(old_mm); mmput(old_mm); return 0; } mmdrop(active_mm); return 0; } static int de_thread(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock; if (thread_group_empty(tsk)) goto no_thread_group; /* * Kill all other threads in the thread group. */ spin_lock_irq(lock); if (signal_group_exit(sig)) { /* * Another group action in progress, just * return so that the signal is processed. */ spin_unlock_irq(lock); return -EAGAIN; } sig->group_exit_task = tsk; sig->notify_count = zap_other_threads(tsk); if (!thread_group_leader(tsk)) sig->notify_count--; while (sig->notify_count) { __set_current_state(TASK_KILLABLE); spin_unlock_irq(lock); schedule(); if (__fatal_signal_pending(tsk)) goto killed; spin_lock_irq(lock); } spin_unlock_irq(lock); /* * At this point all other threads have exited, all we have to * do is to wait for the thread group leader to become inactive, * and to assume its PID: */ if (!thread_group_leader(tsk)) { struct task_struct *leader = tsk->group_leader; for (;;) { cgroup_threadgroup_change_begin(tsk); write_lock_irq(&tasklist_lock); /* * Do this under tasklist_lock to ensure that * exit_notify() can't miss ->group_exit_task */ sig->notify_count = -1; if (likely(leader->exit_state)) break; __set_current_state(TASK_KILLABLE); write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); schedule(); if (__fatal_signal_pending(tsk)) goto killed; } /* * The only record we have of the real-time age of a * process, regardless of execs it's done, is start_time. * All the past CPU time is accumulated in signal_struct * from sister threads now dead. But in this non-leader * exec, nothing survives from the original leader thread, * whose birth marks the true age of this process now. * When we take on its identity by switching to its PID, we * also take its birthdate (always earlier than our own). */ tsk->start_time = leader->start_time; tsk->start_boottime = leader->start_boottime; BUG_ON(!same_thread_group(leader, tsk)); /* * An exec() starts a new thread group with the * TGID of the previous thread group. Rehash the * two threads with a switched PID, and release * the former thread group leader: */ /* Become a process group leader with the old leader's pid. * The old leader becomes a thread of the this thread group. */ exchange_tids(tsk, leader); transfer_pid(leader, tsk, PIDTYPE_TGID); transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); list_replace_rcu(&leader->tasks, &tsk->tasks); list_replace_init(&leader->sibling, &tsk->sibling); tsk->group_leader = tsk; leader->group_leader = tsk; tsk->exit_signal = SIGCHLD; leader->exit_signal = -1; BUG_ON(leader->exit_state != EXIT_ZOMBIE); leader->exit_state = EXIT_DEAD; /* * We are going to release_task()->ptrace_unlink() silently, * the tracer can sleep in do_wait(). EXIT_DEAD guarantees * the tracer wont't block again waiting for this thread. */ if (unlikely(leader->ptrace)) __wake_up_parent(leader, leader->parent); write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); release_task(leader); } sig->group_exit_task = NULL; sig->notify_count = 0; no_thread_group: /* we have changed execution domain */ tsk->exit_signal = SIGCHLD; BUG_ON(!thread_group_leader(tsk)); return 0; killed: /* protects against exit_notify() and __exit_signal() */ read_lock(&tasklist_lock); sig->group_exit_task = NULL; sig->notify_count = 0; read_unlock(&tasklist_lock); return -EAGAIN; } /* * This function makes sure the current process has its own signal table, * so that flush_signal_handlers can later reset the handlers without * disturbing other processes. (Other processes might share the signal * table via the CLONE_SIGHAND option to clone().) */ static int unshare_sighand(struct task_struct *me) { struct sighand_struct *oldsighand = me->sighand; if (refcount_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; /* * This ->sighand is shared with the CLONE_SIGHAND * but not CLONE_THREAD task, switch to the new one. */ newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); if (!newsighand) return -ENOMEM; refcount_set(&newsighand->count, 1); write_lock_irq(&tasklist_lock); spin_lock(&oldsighand->siglock); memcpy(newsighand->action, oldsighand->action, sizeof(newsighand->action)); rcu_assign_pointer(me->sighand, newsighand); spin_unlock(&oldsighand->siglock); write_unlock_irq(&tasklist_lock); __cleanup_sighand(oldsighand); } return 0; } char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) { task_lock(tsk); strncpy(buf, tsk->comm, buf_size); task_unlock(tsk); return buf; } EXPORT_SYMBOL_GPL(__get_task_comm); /* * These functions flushes out all traces of the currently running executable * so that a new one can be started */ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) { task_lock(tsk); trace_task_rename(tsk, buf); strlcpy(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); perf_event_comm(tsk, exec); } /* * Calling this is the point of no return. None of the failures will be * seen by userspace since either the process is already taking a fatal * signal (via de_thread() or coredump), or will have SEGV raised * (after exec_mmap()) by search_binary_handler (see below). */ int begin_new_exec(struct linux_binprm * bprm) { struct task_struct *me = current; int retval; /* Once we are committed compute the creds */ retval = bprm_creds_from_file(bprm); if (retval) return retval; /* * Ensure all future errors are fatal. */ bprm->point_of_no_return = true; /* * Make this the only thread in the thread group. */ retval = de_thread(me); if (retval) goto out; /* * Must be called _before_ exec_mmap() as bprm->mm is * not visibile until then. This also enables the update * to be lockless. */ set_mm_exe_file(bprm->mm, bprm->file); /* If the binary is not readable then enforce mm->dumpable=0 */ would_dump(bprm, bprm->file); if (bprm->have_execfd) would_dump(bprm, bprm->executable); /* * Release all of the old mmap stuff */ acct_arg_size(bprm, 0); retval = exec_mmap(bprm->mm); if (retval) goto out; bprm->mm = NULL; #ifdef CONFIG_POSIX_TIMERS spin_lock_irq(&me->sighand->siglock); posix_cpu_timers_exit(me); spin_unlock_irq(&me->sighand->siglock); exit_itimers(me); flush_itimer_signals(); #endif /* * Make the signal table private. */ retval = unshare_sighand(me); if (retval) goto out_unlock; /* * Ensure that the uaccess routines can actually operate on userspace * pointers: */ force_uaccess_begin(); me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); me->personality &= ~bprm->per_clear; /* * We have to apply CLOEXEC before we change whether the process is * dumpable (in setup_new_exec) to avoid a race with a process in userspace * trying to access the should-be-closed file descriptors of a process * undergoing exec(2). */ do_close_on_exec(me->files); if (bprm->secureexec) { /* Make sure parent cannot signal privileged process. */ me->pdeath_signal = 0; /* * For secureexec, reset the stack limit to sane default to * avoid bad behavior from the prior rlimits. This has to * happen before arch_pick_mmap_layout(), which examines * RLIMIT_STACK, but after the point of no return to avoid * needing to clean up the change on failure. */ if (bprm->rlim_stack.rlim_cur > _STK_LIM) bprm->rlim_stack.rlim_cur = _STK_LIM; } me->sas_ss_sp = me->sas_ss_size = 0; /* * Figure out dumpability. Note that this checking only of current * is wrong, but userspace depends on it. This should be testing * bprm->secureexec instead. */ if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || !(uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))) set_dumpable(current->mm, suid_dumpable); else set_dumpable(current->mm, SUID_DUMP_USER); perf_event_exec(); __set_task_comm(me, kbasename(bprm->filename), true); /* An exec changes our domain. We are no longer part of the thread group */ WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1); flush_signal_handlers(me, 0); /* * install the new credentials for this executable */ security_bprm_committing_creds(bprm); commit_creds(bprm->cred); bprm->cred = NULL; /* * Disable monitoring for regular users * when executing setuid binaries. Must * wait until new credentials are committed * by commit_creds() above */ if (get_dumpable(me->mm) != SUID_DUMP_USER) perf_event_exit_task(me); /* * cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's * credentials; any time after this it may be unlocked. */ security_bprm_committed_creds(bprm); /* Pass the opened binary to the interpreter. */ if (bprm->have_execfd) { retval = get_unused_fd_flags(0); if (retval < 0) goto out_unlock; fd_install(retval, bprm->executable); bprm->executable = NULL; bprm->execfd = retval; } return 0; out_unlock: up_write(&me->signal->exec_update_lock); if (!bprm->cred) mutex_unlock(&me->signal->cred_guard_mutex); out: return retval; } EXPORT_SYMBOL(begin_new_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { struct inode *inode = file_inode(file); if (inode_permission(inode, MAY_READ) < 0) { struct user_namespace *old, *user_ns; bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; /* Ensure mm->user_ns contains the executable */ user_ns = old = bprm->mm->user_ns; while ((user_ns != &init_user_ns) && !privileged_wrt_inode_uidgid(user_ns, inode)) user_ns = user_ns->parent; if (old != user_ns) { bprm->mm->user_ns = get_user_ns(user_ns); put_user_ns(old); } } } EXPORT_SYMBOL(would_dump); void setup_new_exec(struct linux_binprm * bprm) { /* Setup things that can depend upon the personality */ struct task_struct *me = current; arch_pick_mmap_layout(me->mm, &bprm->rlim_stack); arch_setup_new_exec(); /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on * some architectures like powerpc */ me->mm->task_size = TASK_SIZE; up_write(&me->signal->exec_update_lock); mutex_unlock(&me->signal->cred_guard_mutex); } EXPORT_SYMBOL(setup_new_exec); /* Runs immediately before start_thread() takes over. */ void finalize_exec(struct linux_binprm *bprm) { /* Store any stack rlimit changes before starting thread. */ task_lock(current->group_leader); current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack; task_unlock(current->group_leader); } EXPORT_SYMBOL(finalize_exec); /* * Prepare credentials and lock ->cred_guard_mutex. * setup_new_exec() commits the new creds and drops the lock. * Or, if exec fails before, free_bprm() should release ->cred and * and unlock. */ static int prepare_bprm_creds(struct linux_binprm *bprm) { if (mutex_lock_interruptible(&current->signal->cred_guard_mutex)) return -ERESTARTNOINTR; bprm->cred = prepare_exec_creds(); if (likely(bprm->cred)) return 0; mutex_unlock(&current->signal->cred_guard_mutex); return -ENOMEM; } static void free_bprm(struct linux_binprm *bprm) { if (bprm->mm) { acct_arg_size(bprm, 0); mmput(bprm->mm); } free_arg_pages(bprm); if (bprm->cred) { mutex_unlock(&current->signal->cred_guard_mutex); abort_creds(bprm->cred); } if (bprm->file) { allow_write_access(bprm->file); fput(bprm->file); } if (bprm->executable) fput(bprm->executable); /* If a binfmt changed the interp, free it. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); kfree(bprm->fdpath); kfree(bprm); } static struct linux_binprm *alloc_bprm(int fd, struct filename *filename) { struct linux_binprm *bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); int retval = -ENOMEM; if (!bprm) goto out; if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; } else { if (filename->name[0] == '\0') bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); else bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", fd, filename->name); if (!bprm->fdpath) goto out_free; bprm->filename = bprm->fdpath; } bprm->interp = bprm->filename; retval = bprm_mm_init(bprm); if (retval) goto out_free; return bprm; out_free: free_bprm(bprm); out: return ERR_PTR(retval); } int bprm_change_interp(const char *interp, struct linux_binprm *bprm) { /* If a binfmt changed the interp, free it first. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); bprm->interp = kstrdup(interp, GFP_KERNEL); if (!bprm->interp) return -ENOMEM; return 0; } EXPORT_SYMBOL(bprm_change_interp); /* * determine how safe it is to execute the proposed program * - the caller must hold ->cred_guard_mutex to protect against * PTRACE_ATTACH or seccomp thread-sync */ static void check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned n_fs; if (p->ptrace) bprm->unsafe |= LSM_UNSAFE_PTRACE; /* * This isn't strictly necessary, but it makes it harder for LSMs to * mess up. */ if (task_no_new_privs(current)) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; t = p; n_fs = 1; spin_lock(&p->fs->lock); rcu_read_lock(); while_each_thread(p, t) { if (t->fs == p->fs) n_fs++; } rcu_read_unlock(); if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; else p->fs->in_exec = 1; spin_unlock(&p->fs->lock); } static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) { /* Handle suid and sgid on files */ struct inode *inode; unsigned int mode; kuid_t uid; kgid_t gid; if (!mnt_may_suid(file->f_path.mnt)) return; if (task_no_new_privs(current)) return; inode = file->f_path.dentry->d_inode; mode = READ_ONCE(inode->i_mode); if (!(mode & (S_ISUID|S_ISGID))) return; /* Be careful if suid/sgid is set */ inode_lock(inode); /* reload atomically mode/uid/gid now that lock held */ mode = inode->i_mode; uid = inode->i_uid; gid = inode->i_gid; inode_unlock(inode); /* We ignore suid/sgid if there are no mappings for them in the ns */ if (!kuid_has_mapping(bprm->cred->user_ns, uid) || !kgid_has_mapping(bprm->cred->user_ns, gid)) return; if (mode & S_ISUID) { bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->euid = uid; } if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->egid = gid; } } /* * Compute brpm->cred based upon the final binary. */ static int bprm_creds_from_file(struct linux_binprm *bprm) { /* Compute creds based on which file? */ struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file; bprm_fill_uid(bprm, file); return security_bprm_creds_from_file(bprm, file); } /* * Fill the binprm structure from the inode. * Read the first BINPRM_BUF_SIZE bytes * * This may be called multiple times for binary chains (scripts for example). */ static int prepare_binprm(struct linux_binprm *bprm) { loff_t pos = 0; memset(bprm->buf, 0, BINPRM_BUF_SIZE); return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos); } /* * Arguments are '\0' separated strings found at the location bprm->p * points to; chop off the first by relocating brpm->p to right after * the first '\0' encountered. */ int remove_arg_zero(struct linux_binprm *bprm) { int ret = 0; unsigned long offset; char *kaddr; struct page *page; if (!bprm->argc) return 0; do { offset = bprm->p & ~PAGE_MASK; page = get_arg_page(bprm, bprm->p, 0); if (!page) { ret = -EFAULT; goto out; } kaddr = kmap_atomic(page); for (; offset < PAGE_SIZE && kaddr[offset]; offset++, bprm->p++) ; kunmap_atomic(kaddr); put_arg_page(page); } while (offset == PAGE_SIZE); bprm->p++; bprm->argc--; ret = 0; out: return ret; } EXPORT_SYMBOL(remove_arg_zero); #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) /* * cycle the list of binary formats handler, until one recognizes the image */ static int search_binary_handler(struct linux_binprm *bprm) { bool need_retry = IS_ENABLED(CONFIG_MODULES); struct linux_binfmt *fmt; int retval; retval = prepare_binprm(bprm); if (retval < 0) return retval; retval = security_bprm_check(bprm); if (retval) return retval; retval = -ENOENT; retry: read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); retval = fmt->load_binary(bprm); read_lock(&binfmt_lock); put_binfmt(fmt); if (bprm->point_of_no_return || (retval != -ENOEXEC)) { read_unlock(&binfmt_lock); return retval; } } read_unlock(&binfmt_lock); if (need_retry) { if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && printable(bprm->buf[2]) && printable(bprm->buf[3])) return retval; if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0) return retval; need_retry = false; goto retry; } return retval; } static int exec_binprm(struct linux_binprm *bprm) { pid_t old_pid, old_vpid; int ret, depth; /* Need to fetch pid before load_binary changes it */ old_pid = current->pid; rcu_read_lock(); old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); rcu_read_unlock(); /* This allows 4 levels of binfmt rewrites before failing hard. */ for (depth = 0;; depth++) { struct file *exec; if (depth > 5) return -ELOOP; ret = search_binary_handler(bprm); if (ret < 0) return ret; if (!bprm->interpreter) break; exec = bprm->file; bprm->file = bprm->interpreter; bprm->interpreter = NULL; allow_write_access(exec); if (unlikely(bprm->have_execfd)) { if (bprm->executable) { fput(exec); return -ENOEXEC; } bprm->executable = exec; } else fput(exec); } audit_bprm(bprm); trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); proc_exec_connector(current); return 0; } /* * sys_execve() executes a new program. */ static int bprm_execve(struct linux_binprm *bprm, int fd, struct filename *filename, int flags) { struct file *file; struct files_struct *displaced; int retval; /* * Cancel any io_uring activity across execve */ io_uring_task_cancel(); retval = unshare_files(&displaced); if (retval) return retval; retval = prepare_bprm_creds(bprm); if (retval) goto out_files; check_unsafe_exec(bprm); current->in_execve = 1; file = do_open_execat(fd, filename, flags); retval = PTR_ERR(file); if (IS_ERR(file)) goto out_unmark; sched_exec(); bprm->file = file; /* * Record that a name derived from an O_CLOEXEC fd will be * inaccessible after exec. Relies on having exclusive access to * current->files (due to unshare_files above). */ if (bprm->fdpath && close_on_exec(fd, rcu_dereference_raw(current->files->fdt))) bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; /* Set the unchanging part of bprm->cred */ retval = security_bprm_creds_for_exec(bprm); if (retval) goto out; retval = exec_binprm(bprm); if (retval < 0) goto out; /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; rseq_execve(current); acct_update_integrals(current); task_numa_free(current, false); if (displaced) put_files_struct(displaced); return retval; out: /* * If past the point of no return ensure the the code never * returns to the userspace process. Use an existing fatal * signal if present otherwise terminate the process with * SIGSEGV. */ if (bprm->point_of_no_return && !fatal_signal_pending(current)) force_sigsegv(SIGSEGV); out_unmark: current->fs->in_exec = 0; current->in_execve = 0; out_files: if (displaced) reset_files_struct(displaced); return retval; } static int do_execveat_common(int fd, struct filename *filename, struct user_arg_ptr argv, struct user_arg_ptr envp, int flags) { struct linux_binprm *bprm; int retval; if (IS_ERR(filename)) return PTR_ERR(filename); /* * We move the actual failure in case of RLIMIT_NPROC excess from * set*uid() to execve() because too many poorly written programs * don't check setuid() return code. Here we additionally recheck * whether NPROC limit is still exceeded. */ if ((current->flags & PF_NPROC_EXCEEDED) && atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) { retval = -EAGAIN; goto out_ret; } /* We're below the limit (still or again), so we don't want to make * further execve() calls fail. */ current->flags &= ~PF_NPROC_EXCEEDED; bprm = alloc_bprm(fd, filename); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; } retval = count(argv, MAX_ARG_STRINGS); if (retval == 0) pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n", current->comm, bprm->filename); if (retval < 0) goto out_free; bprm->argc = retval; retval = count(envp, MAX_ARG_STRINGS); if (retval < 0) goto out_free; bprm->envc = retval; retval = bprm_stack_limits(bprm); if (retval < 0) goto out_free; retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) goto out_free; bprm->exec = bprm->p; retval = copy_strings(bprm->envc, envp, bprm); if (retval < 0) goto out_free; retval = copy_strings(bprm->argc, argv, bprm); if (retval < 0) goto out_free; /* * When argv is empty, add an empty string ("") as argv[0] to * ensure confused userspace programs that start processing * from argv[1] won't end up walking envp. See also * bprm_stack_limits(). */ if (bprm->argc == 0) { retval = copy_string_kernel("", bprm); if (retval < 0) goto out_free; bprm->argc = 1; } retval = bprm_execve(bprm, fd, filename, flags); out_free: free_bprm(bprm); out_ret: putname(filename); return retval; } int kernel_execve(const char *kernel_filename, const char *const *argv, const char *const *envp) { struct filename *filename; struct linux_binprm *bprm; int fd = AT_FDCWD; int retval; filename = getname_kernel(kernel_filename); if (IS_ERR(filename)) return PTR_ERR(filename); bprm = alloc_bprm(fd, filename); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; } retval = count_strings_kernel(argv); if (WARN_ON_ONCE(retval == 0)) retval = -EINVAL; if (retval < 0) goto out_free; bprm->argc = retval; retval = count_strings_kernel(envp); if (retval < 0) goto out_free; bprm->envc = retval; retval = bprm_stack_limits(bprm); if (retval < 0) goto out_free; retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) goto out_free; bprm->exec = bprm->p; retval = copy_strings_kernel(bprm->envc, envp, bprm); if (retval < 0) goto out_free; retval = copy_strings_kernel(bprm->argc, argv, bprm); if (retval < 0) goto out_free; retval = bprm_execve(bprm, fd, filename, 0); out_free: free_bprm(bprm); out_ret: putname(filename); return retval; } static int do_execve(struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } static int do_execveat(int fd, struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp, int flags) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; return do_execveat_common(fd, filename, argv, envp, flags); } #ifdef CONFIG_COMPAT static int compat_do_execve(struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp) { struct user_arg_ptr argv = { .is_compat = true, .ptr.compat = __argv, }; struct user_arg_ptr envp = { .is_compat = true, .ptr.compat = __envp, }; return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } static int compat_do_execveat(int fd, struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp, int flags) { struct user_arg_ptr argv = { .is_compat = true, .ptr.compat = __argv, }; struct user_arg_ptr envp = { .is_compat = true, .ptr.compat = __envp, }; return do_execveat_common(fd, filename, argv, envp, flags); } #endif void set_binfmt(struct linux_binfmt *new) { struct mm_struct *mm = current->mm; if (mm->binfmt) module_put(mm->binfmt->module); mm->binfmt = new; if (new) __module_get(new->module); } EXPORT_SYMBOL(set_binfmt); /* * set_dumpable stores three-value SUID_DUMP_* into mm->flags. */ void set_dumpable(struct mm_struct *mm, int value) { if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) return; set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value); } SYSCALL_DEFINE3(execve, const char __user *, filename, const char __user *const __user *, argv, const char __user *const __user *, envp) { return do_execve(getname(filename), argv, envp); } SYSCALL_DEFINE5(execveat, int, fd, const char __user *, filename, const char __user *const __user *, argv, const char __user *const __user *, envp, int, flags) { int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; return do_execveat(fd, getname_flags(filename, lookup_flags, NULL), argv, envp, flags); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, const compat_uptr_t __user *, argv, const compat_uptr_t __user *, envp) { return compat_do_execve(getname(filename), argv, envp); } COMPAT_SYSCALL_DEFINE5(execveat, int, fd, const char __user *, filename, const compat_uptr_t __user *, argv, const compat_uptr_t __user *, envp, int, flags) { int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; return compat_do_execveat(fd, getname_flags(filename, lookup_flags, NULL), argv, envp, flags); } #endif
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 // SPDX-License-Identifier: GPL-2.0-or-later /* * cn_proc.c - process events connector * * Copyright (C) Matt Helsley, IBM Corp. 2005 * Based on cn_fork.c by Guillaume Thouvenin <guillaume.thouvenin@bull.net> * Original copyright notice follows: * Copyright (C) 2005 BULL SA. */ #include <linux/kernel.h> #include <linux/ktime.h> #include <linux/init.h> #include <linux/connector.h> #include <linux/gfp.h> #include <linux/ptrace.h> #include <linux/atomic.h> #include <linux/pid_namespace.h> #include <linux/cn_proc.h> #include <linux/local_lock.h> /* * Size of a cn_msg followed by a proc_event structure. Since the * sizeof struct cn_msg is a multiple of 4 bytes, but not 8 bytes, we * add one 4-byte word to the size here, and then start the actual * cn_msg structure 4 bytes into the stack buffer. The result is that * the immediately following proc_event structure is aligned to 8 bytes. */ #define CN_PROC_MSG_SIZE (sizeof(struct cn_msg) + sizeof(struct proc_event) + 4) /* See comment above; we test our assumption about sizeof struct cn_msg here. */ static inline struct cn_msg *buffer_to_cn_msg(__u8 *buffer) { BUILD_BUG_ON(sizeof(struct cn_msg) != 20); return (struct cn_msg *)(buffer + 4); } static atomic_t proc_event_num_listeners = ATOMIC_INIT(0); static struct cb_id cn_proc_event_id = { CN_IDX_PROC, CN_VAL_PROC }; /* local_event.count is used as the sequence number of the netlink message */ struct local_event { local_lock_t lock; __u32 count; }; static DEFINE_PER_CPU(struct local_event, local_event) = { .lock = INIT_LOCAL_LOCK(lock), }; static inline void send_msg(struct cn_msg *msg) { local_lock(&local_event.lock); msg->seq = __this_cpu_inc_return(local_event.count) - 1; ((struct proc_event *)msg->data)->cpu = smp_processor_id(); /* * local_lock() disables preemption during send to ensure the messages * are ordered according to their sequence numbers. * * If cn_netlink_send() fails, the data is not sent. */ cn_netlink_send(msg, 0, CN_IDX_PROC, GFP_NOWAIT); local_unlock(&local_event.lock); } void proc_fork_connector(struct task_struct *task) { struct cn_msg *msg; struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); struct task_struct *parent; if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->timestamp_ns = ktime_get_ns(); ev->what = PROC_EVENT_FORK; rcu_read_lock(); parent = rcu_dereference(task->real_parent); ev->event_data.fork.parent_pid = parent->pid; ev->event_data.fork.parent_tgid = parent->tgid; rcu_read_unlock(); ev->event_data.fork.child_pid = task->pid; ev->event_data.fork.child_tgid = task->tgid; memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } void proc_exec_connector(struct task_struct *task) { struct cn_msg *msg; struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->timestamp_ns = ktime_get_ns(); ev->what = PROC_EVENT_EXEC; ev->event_data.exec.process_pid = task->pid; ev->event_data.exec.process_tgid = task->tgid; memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } void proc_id_connector(struct task_struct *task, int which_id) { struct cn_msg *msg; struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); const struct cred *cred; if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->what = which_id; ev->event_data.id.process_pid = task->pid; ev->event_data.id.process_tgid = task->tgid; rcu_read_lock(); cred = __task_cred(task); if (which_id == PROC_EVENT_UID) { ev->event_data.id.r.ruid = from_kuid_munged(&init_user_ns, cred->uid); ev->event_data.id.e.euid = from_kuid_munged(&init_user_ns, cred->euid); } else if (which_id == PROC_EVENT_GID) { ev->event_data.id.r.rgid = from_kgid_munged(&init_user_ns, cred->gid); ev->event_data.id.e.egid = from_kgid_munged(&init_user_ns, cred->egid); } else { rcu_read_unlock(); return; } rcu_read_unlock(); ev->timestamp_ns = ktime_get_ns(); memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } void proc_sid_connector(struct task_struct *task) { struct cn_msg *msg; struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->timestamp_ns = ktime_get_ns(); ev->what = PROC_EVENT_SID; ev->event_data.sid.process_pid = task->pid; ev->event_data.sid.process_tgid = task->tgid; memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } void proc_ptrace_connector(struct task_struct *task, int ptrace_id) { struct cn_msg *msg; struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->timestamp_ns = ktime_get_ns(); ev->what = PROC_EVENT_PTRACE; ev->event_data.ptrace.process_pid = task->pid; ev->event_data.ptrace.process_tgid = task->tgid; if (ptrace_id == PTRACE_ATTACH) { ev->event_data.ptrace.tracer_pid = current->pid; ev->event_data.ptrace.tracer_tgid = current->tgid; } else if (ptrace_id == PTRACE_DETACH) { ev->event_data.ptrace.tracer_pid = 0; ev->event_data.ptrace.tracer_tgid = 0; } else return; memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } void proc_comm_connector(struct task_struct *task) { struct cn_msg *msg; struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->timestamp_ns = ktime_get_ns(); ev->what = PROC_EVENT_COMM; ev->event_data.comm.process_pid = task->pid; ev->event_data.comm.process_tgid = task->tgid; get_task_comm(ev->event_data.comm.comm, task); memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } void proc_coredump_connector(struct task_struct *task) { struct cn_msg *msg; struct proc_event *ev; struct task_struct *parent; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->timestamp_ns = ktime_get_ns(); ev->what = PROC_EVENT_COREDUMP; ev->event_data.coredump.process_pid = task->pid; ev->event_data.coredump.process_tgid = task->tgid; rcu_read_lock(); if (pid_alive(task)) { parent = rcu_dereference(task->real_parent); ev->event_data.coredump.parent_pid = parent->pid; ev->event_data.coredump.parent_tgid = parent->tgid; } rcu_read_unlock(); memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } void proc_exit_connector(struct task_struct *task) { struct cn_msg *msg; struct proc_event *ev; struct task_struct *parent; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); ev->timestamp_ns = ktime_get_ns(); ev->what = PROC_EVENT_EXIT; ev->event_data.exit.process_pid = task->pid; ev->event_data.exit.process_tgid = task->tgid; ev->event_data.exit.exit_code = task->exit_code; ev->event_data.exit.exit_signal = task->exit_signal; rcu_read_lock(); if (pid_alive(task)) { parent = rcu_dereference(task->real_parent); ev->event_data.exit.parent_pid = parent->pid; ev->event_data.exit.parent_tgid = parent->tgid; } rcu_read_unlock(); memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } /* * Send an acknowledgement message to userspace * * Use 0 for success, EFOO otherwise. * Note: this is the negative of conventional kernel error * values because it's not being returned via syscall return * mechanisms. */ static void cn_proc_ack(int err, int rcvd_seq, int rcvd_ack) { struct cn_msg *msg; struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); if (atomic_read(&proc_event_num_listeners) < 1) return; msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); msg->seq = rcvd_seq; ev->timestamp_ns = ktime_get_ns(); ev->cpu = -1; ev->what = PROC_EVENT_NONE; ev->event_data.ack.err = err; memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = rcvd_ack + 1; msg->len = sizeof(*ev); msg->flags = 0; /* not used */ send_msg(msg); } /** * cn_proc_mcast_ctl * @data: message sent from userspace via the connector */ static void cn_proc_mcast_ctl(struct cn_msg *msg, struct netlink_skb_parms *nsp) { enum proc_cn_mcast_op *mc_op = NULL; int err = 0; if (msg->len != sizeof(*mc_op)) return; /* * Events are reported with respect to the initial pid * and user namespaces so ignore requestors from * other namespaces. */ if ((current_user_ns() != &init_user_ns) || (task_active_pid_ns(current) != &init_pid_ns)) return; /* Can only change if privileged. */ if (!__netlink_ns_capable(nsp, &init_user_ns, CAP_NET_ADMIN)) { err = EPERM; goto out; } mc_op = (enum proc_cn_mcast_op *)msg->data; switch (*mc_op) { case PROC_CN_MCAST_LISTEN: atomic_inc(&proc_event_num_listeners); break; case PROC_CN_MCAST_IGNORE: atomic_dec(&proc_event_num_listeners); break; default: err = EINVAL; break; } out: cn_proc_ack(err, msg->seq, msg->ack); } /* * cn_proc_init - initialization entry point * * Adds the connector callback to the connector driver. */ static int __init cn_proc_init(void) { int err = cn_add_callback(&cn_proc_event_id, "cn_proc", &cn_proc_mcast_ctl); if (err) { pr_warn("cn_proc failed to register\n"); return err; } return 0; } device_initcall(cn_proc_init);
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TRACE_EVENT_H #define _LINUX_TRACE_EVENT_H #include <linux/ring_buffer.h> #include <linux/trace_seq.h> #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/perf_event.h> #include <linux/tracepoint.h> struct trace_array; struct array_buffer; struct tracer; struct dentry; struct bpf_prog; const char *trace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, const struct trace_print_flags *flag_array); const char *trace_print_symbols_seq(struct trace_seq *p, unsigned long val, const struct trace_print_flags *symbol_array); #if BITS_PER_LONG == 32 const char *trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, unsigned long long flags, const struct trace_print_flags_u64 *flag_array); const char *trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, const struct trace_print_flags_u64 *symbol_array); #endif const char *trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, unsigned int bitmask_size); const char *trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int len, bool concatenate); const char *trace_print_array_seq(struct trace_seq *p, const void *buf, int count, size_t el_size); const char * trace_print_hex_dump_seq(struct trace_seq *p, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii); struct trace_iterator; struct trace_event; int trace_raw_output_prep(struct trace_iterator *iter, struct trace_event *event); extern __printf(2, 3) void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...); /* * The trace entry - the most basic unit of tracing. This is what * is printed in the end as a single line in the trace output, such as: * * bash-15816 [01] 235.197585: idle_cpu <- irq_enter */ struct trace_entry { unsigned short type; unsigned char flags; unsigned char preempt_count; int pid; }; #define TRACE_EVENT_TYPE_MAX \ ((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1) /* * Trace iterator - used by printout routines who present trace * results to users and which routines might sleep, etc: */ struct trace_iterator { struct trace_array *tr; struct tracer *trace; struct array_buffer *array_buffer; void *private; int cpu_file; struct mutex mutex; struct ring_buffer_iter **buffer_iter; unsigned long iter_flags; void *temp; /* temp holder */ unsigned int temp_size; char *fmt; /* modified format holder */ unsigned int fmt_size; /* trace_seq for __print_flags() and __print_symbolic() etc. */ struct trace_seq tmp_seq; cpumask_var_t started; /* it's true when current open file is snapshot */ bool snapshot; /* The below is zeroed out in pipe_read */ struct trace_seq seq; struct trace_entry *ent; unsigned long lost_events; int leftover; int ent_size; int cpu; u64 ts; loff_t pos; long idx; /* All new field here will be zeroed out in pipe_read */ }; enum trace_iter_flags { TRACE_FILE_LAT_FMT = 1, TRACE_FILE_ANNOTATE = 2, TRACE_FILE_TIME_IN_NS = 4, }; typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, int flags, struct trace_event *event); struct trace_event_functions { trace_print_func trace; trace_print_func raw; trace_print_func hex; trace_print_func binary; }; struct trace_event { struct hlist_node node; struct list_head list; int type; struct trace_event_functions *funcs; }; extern int register_trace_event(struct trace_event *event); extern int unregister_trace_event(struct trace_event *event); /* Return values for print_line callback */ enum print_line_t { TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ TRACE_TYPE_HANDLED = 1, TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */ TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ }; enum print_line_t trace_handle_return(struct trace_seq *s); void tracing_generic_entry_update(struct trace_entry *entry, unsigned short type, unsigned long flags, int pc); struct trace_event_file; struct ring_buffer_event * trace_event_buffer_lock_reserve(struct trace_buffer **current_buffer, struct trace_event_file *trace_file, int type, unsigned long len, unsigned long flags, int pc); #define TRACE_RECORD_CMDLINE BIT(0) #define TRACE_RECORD_TGID BIT(1) void tracing_record_taskinfo(struct task_struct *task, int flags); void tracing_record_taskinfo_sched_switch(struct task_struct *prev, struct task_struct *next, int flags); void tracing_record_cmdline(struct task_struct *task); void tracing_record_tgid(struct task_struct *task); int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...); struct event_filter; enum trace_reg { TRACE_REG_REGISTER, TRACE_REG_UNREGISTER, #ifdef CONFIG_PERF_EVENTS TRACE_REG_PERF_REGISTER, TRACE_REG_PERF_UNREGISTER, TRACE_REG_PERF_OPEN, TRACE_REG_PERF_CLOSE, /* * These (ADD/DEL) use a 'boolean' return value, where 1 (true) means a * custom action was taken and the default action is not to be * performed. */ TRACE_REG_PERF_ADD, TRACE_REG_PERF_DEL, #endif }; struct trace_event_call; #define TRACE_FUNCTION_TYPE ((const char *)~0UL) struct trace_event_fields { const char *type; union { struct { const char *name; const int size; const int align; const int is_signed; const int filter_type; }; int (*define_fields)(struct trace_event_call *); }; }; struct trace_event_class { const char *system; void *probe; #ifdef CONFIG_PERF_EVENTS void *perf_probe; #endif int (*reg)(struct trace_event_call *event, enum trace_reg type, void *data); struct trace_event_fields *fields_array; struct list_head *(*get_fields)(struct trace_event_call *); struct list_head fields; int (*raw_init)(struct trace_event_call *); }; extern int trace_event_reg(struct trace_event_call *event, enum trace_reg type, void *data); struct trace_event_buffer { struct trace_buffer *buffer; struct ring_buffer_event *event; struct trace_event_file *trace_file; void *entry; unsigned long flags; int pc; struct pt_regs *regs; }; void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, struct trace_event_file *trace_file, unsigned long len); void trace_event_buffer_commit(struct trace_event_buffer *fbuffer); enum { TRACE_EVENT_FL_FILTERED_BIT, TRACE_EVENT_FL_CAP_ANY_BIT, TRACE_EVENT_FL_NO_SET_FILTER_BIT, TRACE_EVENT_FL_IGNORE_ENABLE_BIT, TRACE_EVENT_FL_TRACEPOINT_BIT, TRACE_EVENT_FL_KPROBE_BIT, TRACE_EVENT_FL_UPROBE_BIT, }; /* * Event flags: * FILTERED - The event has a filter attached * CAP_ANY - Any user can enable for perf * NO_SET_FILTER - Set when filter has error and is to be ignored * IGNORE_ENABLE - For trace internal events, do not enable with debugfs file * TRACEPOINT - Event is a tracepoint * KPROBE - Event is a kprobe * UPROBE - Event is a uprobe */ enum { TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT), TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT), TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), TRACE_EVENT_FL_UPROBE = (1 << TRACE_EVENT_FL_UPROBE_BIT), }; #define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE) struct trace_event_call { struct list_head list; struct trace_event_class *class; union { char *name; /* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */ struct tracepoint *tp; }; struct trace_event event; char *print_fmt; struct event_filter *filter; void *mod; void *data; /* * bit 0: filter_active * bit 1: allow trace by non root (cap any) * bit 2: failed to apply filter * bit 3: trace internal event (do not enable) * bit 4: Event was enabled by module * bit 5: use call filter rather than file filter * bit 6: Event is a tracepoint */ int flags; /* static flags of different events */ #ifdef CONFIG_PERF_EVENTS int perf_refcount; struct hlist_head __percpu *perf_events; struct bpf_prog_array __rcu *prog_array; int (*perf_perm)(struct trace_event_call *, struct perf_event *); #endif }; #ifdef CONFIG_PERF_EVENTS static inline bool bpf_prog_array_valid(struct trace_event_call *call) { /* * This inline function checks whether call->prog_array * is valid or not. The function is called in various places, * outside rcu_read_lock/unlock, as a heuristic to speed up execution. * * If this function returns true, and later call->prog_array * becomes false inside rcu_read_lock/unlock region, * we bail out then. If this function return false, * there is a risk that we might miss a few events if the checking * were delayed until inside rcu_read_lock/unlock region and * call->prog_array happened to become non-NULL then. * * Here, READ_ONCE() is used instead of rcu_access_pointer(). * rcu_access_pointer() requires the actual definition of * "struct bpf_prog_array" while READ_ONCE() only needs * a declaration of the same type. */ return !!READ_ONCE(call->prog_array); } #endif static inline const char * trace_event_name(struct trace_event_call *call) { if (call->flags & TRACE_EVENT_FL_TRACEPOINT) return call->tp ? call->tp->name : NULL; else return call->name; } static inline struct list_head * trace_get_fields(struct trace_event_call *event_call) { if (!event_call->class->get_fields) return &event_call->class->fields; return event_call->class->get_fields(event_call); } struct trace_array; struct trace_subsystem_dir; enum { EVENT_FILE_FL_ENABLED_BIT, EVENT_FILE_FL_RECORDED_CMD_BIT, EVENT_FILE_FL_RECORDED_TGID_BIT, EVENT_FILE_FL_FILTERED_BIT, EVENT_FILE_FL_NO_SET_FILTER_BIT, EVENT_FILE_FL_SOFT_MODE_BIT, EVENT_FILE_FL_SOFT_DISABLED_BIT, EVENT_FILE_FL_TRIGGER_MODE_BIT, EVENT_FILE_FL_TRIGGER_COND_BIT, EVENT_FILE_FL_PID_FILTER_BIT, EVENT_FILE_FL_WAS_ENABLED_BIT, EVENT_FILE_FL_FREED_BIT, }; extern struct trace_event_file *trace_get_event_file(const char *instance, const char *system, const char *event); extern void trace_put_event_file(struct trace_event_file *file); #define MAX_DYNEVENT_CMD_LEN (2048) enum dynevent_type { DYNEVENT_TYPE_SYNTH = 1, DYNEVENT_TYPE_KPROBE, DYNEVENT_TYPE_NONE, }; struct dynevent_cmd; typedef int (*dynevent_create_fn_t)(struct dynevent_cmd *cmd); struct dynevent_cmd { struct seq_buf seq; const char *event_name; unsigned int n_fields; enum dynevent_type type; dynevent_create_fn_t run_command; void *private_data; }; extern int dynevent_create(struct dynevent_cmd *cmd); extern int synth_event_delete(const char *name); extern void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen); extern int __synth_event_gen_cmd_start(struct dynevent_cmd *cmd, const char *name, struct module *mod, ...); #define synth_event_gen_cmd_start(cmd, name, mod, ...) \ __synth_event_gen_cmd_start(cmd, name, mod, ## __VA_ARGS__, NULL) struct synth_field_desc { const char *type; const char *name; }; extern int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name, struct module *mod, struct synth_field_desc *fields, unsigned int n_fields); extern int synth_event_create(const char *name, struct synth_field_desc *fields, unsigned int n_fields, struct module *mod); extern int synth_event_add_field(struct dynevent_cmd *cmd, const char *type, const char *name); extern int synth_event_add_field_str(struct dynevent_cmd *cmd, const char *type_name); extern int synth_event_add_fields(struct dynevent_cmd *cmd, struct synth_field_desc *fields, unsigned int n_fields); #define synth_event_gen_cmd_end(cmd) \ dynevent_create(cmd) struct synth_event; struct synth_event_trace_state { struct trace_event_buffer fbuffer; struct synth_trace_event *entry; struct trace_buffer *buffer; struct synth_event *event; unsigned int cur_field; unsigned int n_u64; bool disabled; bool add_next; bool add_name; }; extern int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...); extern int synth_event_trace_array(struct trace_event_file *file, u64 *vals, unsigned int n_vals); extern int synth_event_trace_start(struct trace_event_file *file, struct synth_event_trace_state *trace_state); extern int synth_event_add_next_val(u64 val, struct synth_event_trace_state *trace_state); extern int synth_event_add_val(const char *field_name, u64 val, struct synth_event_trace_state *trace_state); extern int synth_event_trace_end(struct synth_event_trace_state *trace_state); extern int kprobe_event_delete(const char *name); extern void kprobe_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen); #define kprobe_event_gen_cmd_start(cmd, name, loc, ...) \ __kprobe_event_gen_cmd_start(cmd, false, name, loc, ## __VA_ARGS__, NULL) #define kretprobe_event_gen_cmd_start(cmd, name, loc, ...) \ __kprobe_event_gen_cmd_start(cmd, true, name, loc, ## __VA_ARGS__, NULL) extern int __kprobe_event_gen_cmd_start(struct dynevent_cmd *cmd, bool kretprobe, const char *name, const char *loc, ...); #define kprobe_event_add_fields(cmd, ...) \ __kprobe_event_add_fields(cmd, ## __VA_ARGS__, NULL) #define kprobe_event_add_field(cmd, field) \ __kprobe_event_add_fields(cmd, field, NULL) extern int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...); #define kprobe_event_gen_cmd_end(cmd) \ dynevent_create(cmd) #define kretprobe_event_gen_cmd_end(cmd) \ dynevent_create(cmd) /* * Event file flags: * ENABLED - The event is enabled * RECORDED_CMD - The comms should be recorded at sched_switch * RECORDED_TGID - The tgids should be recorded at sched_switch * FILTERED - The event has a filter attached * NO_SET_FILTER - Set when filter has error and is to be ignored * SOFT_MODE - The event is enabled/disabled by SOFT_DISABLED * SOFT_DISABLED - When set, do not trace the event (even though its * tracepoint may be enabled) * TRIGGER_MODE - When set, invoke the triggers associated with the event * TRIGGER_COND - When set, one or more triggers has an associated filter * PID_FILTER - When set, the event is filtered based on pid * WAS_ENABLED - Set when enabled to know to clear trace on module removal * FREED - File descriptor is freed, all fields should be considered invalid */ enum { EVENT_FILE_FL_ENABLED = (1 << EVENT_FILE_FL_ENABLED_BIT), EVENT_FILE_FL_RECORDED_CMD = (1 << EVENT_FILE_FL_RECORDED_CMD_BIT), EVENT_FILE_FL_RECORDED_TGID = (1 << EVENT_FILE_FL_RECORDED_TGID_BIT), EVENT_FILE_FL_FILTERED = (1 << EVENT_FILE_FL_FILTERED_BIT), EVENT_FILE_FL_NO_SET_FILTER = (1 << EVENT_FILE_FL_NO_SET_FILTER_BIT), EVENT_FILE_FL_SOFT_MODE = (1 << EVENT_FILE_FL_SOFT_MODE_BIT), EVENT_FILE_FL_SOFT_DISABLED = (1 << EVENT_FILE_FL_SOFT_DISABLED_BIT), EVENT_FILE_FL_TRIGGER_MODE = (1 << EVENT_FILE_FL_TRIGGER_MODE_BIT), EVENT_FILE_FL_TRIGGER_COND = (1 << EVENT_FILE_FL_TRIGGER_COND_BIT), EVENT_FILE_FL_PID_FILTER = (1 << EVENT_FILE_FL_PID_FILTER_BIT), EVENT_FILE_FL_WAS_ENABLED = (1 << EVENT_FILE_FL_WAS_ENABLED_BIT), EVENT_FILE_FL_FREED = (1 << EVENT_FILE_FL_FREED_BIT), }; struct trace_event_file { struct list_head list; struct trace_event_call *event_call; struct event_filter __rcu *filter; struct dentry *dir; struct trace_array *tr; struct trace_subsystem_dir *system; struct list_head triggers; /* * 32 bit flags: * bit 0: enabled * bit 1: enabled cmd record * bit 2: enable/disable with the soft disable bit * bit 3: soft disabled * bit 4: trigger enabled * * Note: The bits must be set atomically to prevent races * from other writers. Reads of flags do not need to be in * sync as they occur in critical sections. But the way flags * is currently used, these changes do not affect the code * except that when a change is made, it may have a slight * delay in propagating the changes to other CPUs due to * caching and such. Which is mostly OK ;-) */ unsigned long flags; atomic_t ref; /* ref count for opened files */ atomic_t sm_ref; /* soft-mode reference counter */ atomic_t tm_ref; /* trigger-mode reference counter */ }; #define __TRACE_EVENT_FLAGS(name, value) \ static int __init trace_init_flags_##name(void) \ { \ event_##name.flags |= value; \ return 0; \ } \ early_initcall(trace_init_flags_##name); #define __TRACE_EVENT_PERF_PERM(name, expr...) \ static int perf_perm_##name(struct trace_event_call *tp_event, \ struct perf_event *p_event) \ { \ return ({ expr; }); \ } \ static int __init trace_init_perf_perm_##name(void) \ { \ event_##name.perf_perm = &perf_perm_##name; \ return 0; \ } \ early_initcall(trace_init_perf_perm_##name); #define PERF_MAX_TRACE_SIZE 2048 #define MAX_FILTER_STR_VAL 256U /* Should handle KSYM_SYMBOL_LEN */ enum event_trigger_type { ETT_NONE = (0), ETT_TRACE_ONOFF = (1 << 0), ETT_SNAPSHOT = (1 << 1), ETT_STACKTRACE = (1 << 2), ETT_EVENT_ENABLE = (1 << 3), ETT_EVENT_HIST = (1 << 4), ETT_HIST_ENABLE = (1 << 5), }; extern int filter_match_preds(struct event_filter *filter, void *rec); extern enum event_trigger_type event_triggers_call(struct trace_event_file *file, void *rec, struct ring_buffer_event *event); extern void event_triggers_post_call(struct trace_event_file *file, enum event_trigger_type tt); bool trace_event_ignore_this_pid(struct trace_event_file *trace_file); /** * trace_trigger_soft_disabled - do triggers and test if soft disabled * @file: The file pointer of the event to test * * If any triggers without filters are attached to this event, they * will be called here. If the event is soft disabled and has no * triggers that require testing the fields, it will return true, * otherwise false. */ static inline bool trace_trigger_soft_disabled(struct trace_event_file *file) { unsigned long eflags = file->flags; if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) { if (eflags & EVENT_FILE_FL_TRIGGER_MODE) event_triggers_call(file, NULL, NULL); if (eflags & EVENT_FILE_FL_SOFT_DISABLED) return true; if (eflags & EVENT_FILE_FL_PID_FILTER) return trace_event_ignore_this_pid(file); } return false; } #ifdef CONFIG_BPF_EVENTS unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog); void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name); void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp); int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, u64 *probe_addr); #else static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { return 1; } static inline int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { return -EOPNOTSUPP; } static inline void perf_event_detach_bpf_prog(struct perf_event *event) { } static inline int perf_event_query_prog_array(struct perf_event *event, void __user *info) { return -EOPNOTSUPP; } static inline int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *p) { return -EOPNOTSUPP; } static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *p) { return -EOPNOTSUPP; } static inline struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name) { return NULL; } static inline void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) { } static inline int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, u64 *probe_addr) { return -EOPNOTSUPP; } #endif enum { FILTER_OTHER = 0, FILTER_STATIC_STRING, FILTER_DYN_STRING, FILTER_PTR_STRING, FILTER_TRACE_FN, FILTER_COMM, FILTER_CPU, }; extern int trace_event_raw_init(struct trace_event_call *call); extern int trace_define_field(struct trace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, int filter_type); extern int trace_add_event_call(struct trace_event_call *call); extern int trace_remove_event_call(struct trace_event_call *call); extern int trace_event_get_offsets(struct trace_event_call *call); #define is_signed_type(type) (((type)(-1)) < (type)1) int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); int trace_set_clr_event(const char *system, const char *event, int set); int trace_array_set_clr_event(struct trace_array *tr, const char *system, const char *event, bool enable); /* * The double __builtin_constant_p is because gcc will give us an error * if we try to allocate the static variable to fmt if it is not a * constant. Even with the outer if statement optimizing out. */ #define event_trace_printk(ip, fmt, args...) \ do { \ __trace_printk_check_format(fmt, ##args); \ tracing_record_cmdline(current); \ if (__builtin_constant_p(fmt)) { \ static const char *trace_printk_fmt \ __section("__trace_printk_fmt") = \ __builtin_constant_p(fmt) ? fmt : NULL; \ \ __trace_bprintk(ip, trace_printk_fmt, ##args); \ } else \ __trace_printk(ip, fmt, ##args); \ } while (0) #ifdef CONFIG_PERF_EVENTS struct perf_event; DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); DECLARE_PER_CPU(int, bpf_kprobe_override); extern int perf_trace_init(struct perf_event *event); extern void perf_trace_destroy(struct perf_event *event); extern int perf_trace_add(struct perf_event *event, int flags); extern void perf_trace_del(struct perf_event *event, int flags); #ifdef CONFIG_KPROBE_EVENTS extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe); extern void perf_kprobe_destroy(struct perf_event *event); extern int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, const char **symbol, u64 *probe_offset, u64 *probe_addr, bool perf_type_tracepoint); #endif #ifdef CONFIG_UPROBE_EVENTS extern int perf_uprobe_init(struct perf_event *event, unsigned long ref_ctr_offset, bool is_retprobe); extern void perf_uprobe_destroy(struct perf_event *event); extern int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, const char **filename, u64 *probe_offset, u64 *probe_addr, bool perf_type_tracepoint); #endif extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str); extern void ftrace_profile_free_filter(struct perf_event *event); void perf_trace_buf_update(void *record, u16 type); void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp); void bpf_trace_run1(struct bpf_prog *prog, u64 arg1); void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2); void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3); void bpf_trace_run4(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4); void bpf_trace_run5(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5); void bpf_trace_run6(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6); void bpf_trace_run7(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7); void bpf_trace_run8(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8); void bpf_trace_run9(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8, u64 arg9); void bpf_trace_run10(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8, u64 arg9, u64 arg10); void bpf_trace_run11(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8, u64 arg9, u64 arg10, u64 arg11); void bpf_trace_run12(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8, u64 arg9, u64 arg10, u64 arg11, u64 arg12); void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, struct trace_event_call *call, u64 count, struct pt_regs *regs, struct hlist_head *head, struct task_struct *task); static inline void perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type, u64 count, struct pt_regs *regs, void *head, struct task_struct *task) { perf_tp_event(type, count, raw_data, size, regs, head, rctx, task); } #endif #endif /* _LINUX_TRACE_EVENT_H */
8 8 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 /* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion * * Copyright IBM Corporation, 2001 * * Author: Dipankar Sarma <dipankar@in.ibm.com> * * Based on the original work by Paul McKenney <paulmck@vnet.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * Papers: * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) * * For detailed explanation of Read-Copy Update mechanism see - * http://lse.sourceforge.net/locking/rcupdate.html * */ #ifndef __LINUX_RCUPDATE_H #define __LINUX_RCUPDATE_H #include <linux/types.h> #include <linux/compiler.h> #include <linux/atomic.h> #include <linux/irqflags.h> #include <linux/preempt.h> #include <linux/bottom_half.h> #include <linux/lockdep.h> #include <asm/processor.h> #include <linux/cpumask.h> #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) #define ulong2long(a) (*(long *)(&(a))) #define USHORT_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b))) #define USHORT_CMP_LT(a, b) (USHRT_MAX / 2 < (unsigned short)((a) - (b))) /* Exported common interfaces */ void call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier_tasks(void); void rcu_barrier_tasks_rude(void); void synchronize_rcu(void); #ifdef CONFIG_PREEMPT_RCU void __rcu_read_lock(void); void __rcu_read_unlock(void); /* * Defined as a macro as it is a very low level header included from * areas that don't even know about current. This gives the rcu_read_lock() * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other * types of kernel builds, the rcu_read_lock() nesting depth is unknowable. */ #define rcu_preempt_depth() (current->rcu_read_lock_nesting) #else /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TINY_RCU #define rcu_read_unlock_strict() do { } while (0) #else void rcu_read_unlock_strict(void); #endif static inline void __rcu_read_lock(void) { preempt_disable(); } static inline void __rcu_read_unlock(void) { preempt_enable(); rcu_read_unlock_strict(); } static inline int rcu_preempt_depth(void) { return 0; } #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /* Internal to kernel */ void rcu_init(void); extern int rcu_scheduler_active __read_mostly; void rcu_sched_clock_irq(int user); void rcu_report_dead(unsigned int cpu); void rcutree_migrate_callbacks(int cpu); #ifdef CONFIG_TASKS_RCU_GENERIC void rcu_init_tasks_generic(void); #else static inline void rcu_init_tasks_generic(void) { } #endif #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); #else /* #ifdef CONFIG_RCU_STALL_COMMON */ static inline void rcu_sysrq_start(void) { } static inline void rcu_sysrq_end(void) { } #endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */ #ifdef CONFIG_NO_HZ_FULL void rcu_user_enter(void); void rcu_user_exit(void); #else static inline void rcu_user_enter(void) { } static inline void rcu_user_exit(void) { } #endif /* CONFIG_NO_HZ_FULL */ #ifdef CONFIG_RCU_NOCB_CPU void rcu_init_nohz(void); void rcu_nocb_flush_deferred_wakeup(void); #else /* #ifdef CONFIG_RCU_NOCB_CPU */ static inline void rcu_init_nohz(void) { } static inline void rcu_nocb_flush_deferred_wakeup(void) { } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ /** * RCU_NONIDLE - Indicate idle-loop code that needs RCU readers * @a: Code that RCU needs to pay attention to. * * RCU read-side critical sections are forbidden in the inner idle loop, * that is, between the rcu_idle_enter() and the rcu_idle_exit() -- RCU * will happily ignore any such read-side critical sections. However, * things like powertop need tracepoints in the inner idle loop. * * This macro provides the way out: RCU_NONIDLE(do_something_with_RCU()) * will tell RCU that it needs to pay attention, invoke its argument * (in this example, calling the do_something_with_RCU() function), * and then tell RCU to go back to ignoring this CPU. It is permissible * to nest RCU_NONIDLE() wrappers, but not indefinitely (but the limit is * on the order of a million or so, even on 32-bit systems). It is * not legal to block within RCU_NONIDLE(), nor is it permissible to * transfer control either into or out of RCU_NONIDLE()'s statement. */ #define RCU_NONIDLE(a) \ do { \ rcu_irq_enter_irqson(); \ do { a; } while (0); \ rcu_irq_exit_irqson(); \ } while (0) /* * Note a quasi-voluntary context switch for RCU-tasks's benefit. * This is a macro rather than an inline function to avoid #include hell. */ #ifdef CONFIG_TASKS_RCU_GENERIC # ifdef CONFIG_TASKS_RCU # define rcu_tasks_classic_qs(t, preempt) \ do { \ if (!(preempt) && READ_ONCE((t)->rcu_tasks_holdout)) \ WRITE_ONCE((t)->rcu_tasks_holdout, false); \ } while (0) void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); # else # define rcu_tasks_classic_qs(t, preempt) do { } while (0) # define call_rcu_tasks call_rcu # define synchronize_rcu_tasks synchronize_rcu # endif # ifdef CONFIG_TASKS_TRACE_RCU # define rcu_tasks_trace_qs(t) \ do { \ if (!likely(READ_ONCE((t)->trc_reader_checked)) && \ !unlikely(READ_ONCE((t)->trc_reader_nesting))) { \ smp_store_release(&(t)->trc_reader_checked, true); \ smp_mb(); /* Readers partitioned by store. */ \ } \ } while (0) # else # define rcu_tasks_trace_qs(t) do { } while (0) # endif #define rcu_tasks_qs(t, preempt) \ do { \ rcu_tasks_classic_qs((t), (preempt)); \ rcu_tasks_trace_qs((t)); \ } while (0) # ifdef CONFIG_TASKS_RUDE_RCU void call_rcu_tasks_rude(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks_rude(void); # endif #define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false) void exit_tasks_rcu_start(void); void exit_tasks_rcu_stop(void); void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ #define rcu_tasks_qs(t, preempt) do { } while (0) #define rcu_note_voluntary_context_switch(t) do { } while (0) #define call_rcu_tasks call_rcu #define synchronize_rcu_tasks synchronize_rcu static inline void exit_tasks_rcu_start(void) { } static inline void exit_tasks_rcu_stop(void) { } static inline void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ /** * rcu_trace_implies_rcu_gp - does an RCU Tasks Trace grace period imply an RCU grace period? * * As an accident of implementation, an RCU Tasks Trace grace period also * acts as an RCU grace period. However, this could change at any time. * Code relying on this accident must call this function to verify that * this accident is still happening. * * You have been warned! */ static inline bool rcu_trace_implies_rcu_gp(void) { return true; } /** * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU * * This macro resembles cond_resched(), except that it is defined to * report potential quiescent states to RCU-tasks even if the cond_resched() * machinery were to be shut off, as some advocate for PREEMPTION kernels. */ #define cond_resched_tasks_rcu_qs() \ do { \ rcu_tasks_qs(current, false); \ cond_resched(); \ } while (0) /** * rcu_softirq_qs_periodic - Report RCU and RCU-Tasks quiescent states * @old_ts: jiffies at start of processing. * * This helper is for long-running softirq handlers, such as NAPI threads in * networking. The caller should initialize the variable passed in as @old_ts * at the beginning of the softirq handler. When invoked frequently, this macro * will invoke rcu_softirq_qs() every 100 milliseconds thereafter, which will * provide both RCU and RCU-Tasks quiescent states. Note that this macro * modifies its old_ts argument. * * Because regions of code that have disabled softirq act as RCU read-side * critical sections, this macro should be invoked with softirq (and * preemption) enabled. * * The macro is not needed when CONFIG_PREEMPT_RT is defined. RT kernels would * have more chance to invoke schedule() calls and provide necessary quiescent * states. As a contrast, calling cond_resched() only won't achieve the same * effect because cond_resched() does not provide RCU-Tasks quiescent states. */ #define rcu_softirq_qs_periodic(old_ts) \ do { \ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && \ time_after(jiffies, (old_ts) + HZ / 10)) { \ preempt_disable(); \ rcu_softirq_qs(); \ preempt_enable(); \ (old_ts) = jiffies; \ } \ } while (0) /* * Infrastructure to implement the synchronize_() primitives in * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. */ #if defined(CONFIG_TREE_RCU) #include <linux/rcutree.h> #elif defined(CONFIG_TINY_RCU) #include <linux/rcutiny.h> #else #error "Unknown RCU implementation specified to kernel configuration" #endif /* * The init_rcu_head_on_stack() and destroy_rcu_head_on_stack() calls * are needed for dynamic initialization and destruction of rcu_head * on the stack, and init_rcu_head()/destroy_rcu_head() are needed for * dynamic initialization and destruction of statically allocated rcu_head * structures. However, rcu_head structures allocated dynamically in the * heap don't need any initialization. */ #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD void init_rcu_head(struct rcu_head *head); void destroy_rcu_head(struct rcu_head *head); void init_rcu_head_on_stack(struct rcu_head *head); void destroy_rcu_head_on_stack(struct rcu_head *head); #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ static inline void init_rcu_head(struct rcu_head *head) { } static inline void destroy_rcu_head(struct rcu_head *head) { } static inline void init_rcu_head_on_stack(struct rcu_head *head) { } static inline void destroy_rcu_head_on_stack(struct rcu_head *head) { } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) bool rcu_lockdep_current_cpu_online(void); #else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ static inline bool rcu_lockdep_current_cpu_online(void) { return true; } #endif /* #else #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ #ifdef CONFIG_DEBUG_LOCK_ALLOC static inline void rcu_lock_acquire(struct lockdep_map *map) { lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_); } static inline void rcu_lock_release(struct lockdep_map *map) { lock_release(map, _THIS_IP_); } extern struct lockdep_map rcu_lock_map; extern struct lockdep_map rcu_bh_lock_map; extern struct lockdep_map rcu_sched_lock_map; extern struct lockdep_map rcu_callback_map; int debug_lockdep_rcu_enabled(void); int rcu_read_lock_held(void); int rcu_read_lock_bh_held(void); int rcu_read_lock_sched_held(void); int rcu_read_lock_any_held(void); #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ # define rcu_lock_acquire(a) do { } while (0) # define rcu_lock_release(a) do { } while (0) static inline int rcu_read_lock_held(void) { return 1; } static inline int rcu_read_lock_bh_held(void) { return 1; } static inline int rcu_read_lock_sched_held(void) { return !preemptible(); } static inline int rcu_read_lock_any_held(void) { return !preemptible(); } #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #ifdef CONFIG_PROVE_RCU /** * RCU_LOCKDEP_WARN - emit lockdep splat if specified condition is met * @c: condition to check * @s: informative message * * This checks debug_lockdep_rcu_enabled() before checking (c) to * prevent early boot splats due to lockdep not yet being initialized, * and rechecks it after checking (c) to prevent false-positive splats * due to races with lockdep being disabled. See commit 3066820034b5dd * ("rcu: Reject RCU_LOCKDEP_WARN() false positives") for more detail. */ #define RCU_LOCKDEP_WARN(c, s) \ do { \ static bool __section(".data.unlikely") __warned; \ if (debug_lockdep_rcu_enabled() && (c) && \ debug_lockdep_rcu_enabled() && !__warned) { \ __warned = true; \ lockdep_rcu_suspicious(__FILE__, __LINE__, s); \ } \ } while (0) #if defined(CONFIG_PROVE_RCU) && !defined(CONFIG_PREEMPT_RCU) static inline void rcu_preempt_sleep_check(void) { RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map), "Illegal context switch in RCU read-side critical section"); } #else /* #ifdef CONFIG_PROVE_RCU */ static inline void rcu_preempt_sleep_check(void) { } #endif /* #else #ifdef CONFIG_PROVE_RCU */ #define rcu_sleep_check() \ do { \ rcu_preempt_sleep_check(); \ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), \ "Illegal context switch in RCU-bh read-side critical section"); \ RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), \ "Illegal context switch in RCU-sched read-side critical section"); \ } while (0) #else /* #ifdef CONFIG_PROVE_RCU */ #define RCU_LOCKDEP_WARN(c, s) do { } while (0) #define rcu_sleep_check() do { } while (0) #endif /* #else #ifdef CONFIG_PROVE_RCU */ /* * Helper functions for rcu_dereference_check(), rcu_dereference_protected() * and rcu_assign_pointer(). Some of these could be folded into their * callers, but they are left separate in order to ease introduction of * multiple pointers markings to match different RCU implementations * (e.g., __srcu), should this make sense in the future. */ #ifdef __CHECKER__ #define rcu_check_sparse(p, space) \ ((void)(((typeof(*p) space *)p) == p)) #else /* #ifdef __CHECKER__ */ #define rcu_check_sparse(p, space) #endif /* #else #ifdef __CHECKER__ */ /** * unrcu_pointer - mark a pointer as not being RCU protected * @p: pointer needing to lose its __rcu property * * Converts @p from an __rcu pointer to a __kernel pointer. * This allows an __rcu pointer to be used with xchg() and friends. */ #define unrcu_pointer(p) \ ({ \ typeof(*p) *_________p1 = (typeof(*p) *__force)(p); \ rcu_check_sparse(p, __rcu); \ ((typeof(*p) __force __kernel *)(_________p1)); \ }) #define __rcu_access_pointer(p, space) \ ({ \ typeof(*p) *_________p1 = (typeof(*p) *__force)READ_ONCE(p); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(_________p1)); \ }) #define __rcu_dereference_check(p, c, space) \ ({ \ /* Dependency order vs. p above. */ \ typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(________p1)); \ }) #define __rcu_dereference_protected(p, c, space) \ ({ \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_protected() usage"); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(p)); \ }) #define rcu_dereference_raw(p) \ ({ \ /* Dependency order vs. p above. */ \ typeof(p) ________p1 = READ_ONCE(p); \ ((typeof(*p) __force __kernel *)(________p1)); \ }) /** * RCU_INITIALIZER() - statically initialize an RCU-protected global variable * @v: The value to statically initialize with. */ #define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v) /** * rcu_assign_pointer() - assign to RCU-protected pointer * @p: pointer to assign to * @v: value to assign (publish) * * Assigns the specified value to the specified RCU-protected * pointer, ensuring that any concurrent RCU readers will see * any prior initialization. * * Inserts memory barriers on architectures that require them * (which is most of them), and also prevents the compiler from * reordering the code that initializes the structure after the pointer * assignment. More importantly, this call documents which pointers * will be dereferenced by RCU read-side code. * * In some special cases, you may use RCU_INIT_POINTER() instead * of rcu_assign_pointer(). RCU_INIT_POINTER() is a bit faster due * to the fact that it does not constrain either the CPU or the compiler. * That said, using RCU_INIT_POINTER() when you should have used * rcu_assign_pointer() is a very bad thing that results in * impossible-to-diagnose memory corruption. So please be careful. * See the RCU_INIT_POINTER() comment header for details. * * Note that rcu_assign_pointer() evaluates each of its arguments only * once, appearances notwithstanding. One of the "extra" evaluations * is in typeof() and the other visible only to sparse (__CHECKER__), * neither of which actually execute the argument. As with most cpp * macros, this execute-arguments-only-once property is important, so * please be careful when making changes to rcu_assign_pointer() and the * other macros that it invokes. */ #define rcu_assign_pointer(p, v) \ do { \ uintptr_t _r_a_p__v = (uintptr_t)(v); \ rcu_check_sparse(p, __rcu); \ \ if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \ WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \ else \ smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \ } while (0) /** * rcu_replace_pointer() - replace an RCU pointer, returning its old value * @rcu_ptr: RCU pointer, whose old value is returned * @ptr: regular pointer * @c: the lockdep conditions under which the dereference will take place * * Perform a replacement, where @rcu_ptr is an RCU-annotated * pointer and @c is the lockdep argument that is passed to the * rcu_dereference_protected() call used to read that pointer. The old * value of @rcu_ptr is returned, and @rcu_ptr is set to @ptr. */ #define rcu_replace_pointer(rcu_ptr, ptr, c) \ ({ \ typeof(ptr) __tmp = rcu_dereference_protected((rcu_ptr), (c)); \ rcu_assign_pointer((rcu_ptr), (ptr)); \ __tmp; \ }) /** * rcu_access_pointer() - fetch RCU pointer with no dereferencing * @p: The pointer to read * * Return the value of the specified RCU-protected pointer, but omit the * lockdep checks for being in an RCU read-side critical section. This is * useful when the value of this pointer is accessed, but the pointer is * not dereferenced, for example, when testing an RCU-protected pointer * against NULL. Although rcu_access_pointer() may also be used in cases * where update-side locks prevent the value of the pointer from changing, * you should instead use rcu_dereference_protected() for this use case. * * It is also permissible to use rcu_access_pointer() when read-side * access to the pointer was removed at least one grace period ago, as * is the case in the context of the RCU callback that is freeing up * the data, or after a synchronize_rcu() returns. This can be useful * when tearing down multi-linked structures after a grace period * has elapsed. */ #define rcu_access_pointer(p) __rcu_access_pointer((p), __rcu) /** * rcu_dereference_check() - rcu_dereference with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * Do an rcu_dereference(), but check that the conditions under which the * dereference will take place are correct. Typically the conditions * indicate the various locking conditions that should be held at that * point. The check should return true if the conditions are satisfied. * An implicit check for being in an RCU read-side critical section * (rcu_read_lock()) is included. * * For example: * * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock)); * * could be used to indicate to lockdep that foo->bar may only be dereferenced * if either rcu_read_lock() is held, or that the lock required to replace * the bar struct at foo->bar is held. * * Note that the list of conditions may also include indications of when a lock * need not be held, for example during initialisation or destruction of the * target struct: * * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) || * atomic_read(&foo->usage) == 0); * * Inserts memory barriers on architectures that require them * (currently only the Alpha), prevents the compiler from refetching * (and from merging fetches), and, more importantly, documents exactly * which pointers are protected by RCU and checks that the pointer is * annotated as __rcu. */ #define rcu_dereference_check(p, c) \ __rcu_dereference_check((p), (c) || rcu_read_lock_held(), __rcu) /** * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * This is the RCU-bh counterpart to rcu_dereference_check(). */ #define rcu_dereference_bh_check(p, c) \ __rcu_dereference_check((p), (c) || rcu_read_lock_bh_held(), __rcu) /** * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * This is the RCU-sched counterpart to rcu_dereference_check(). */ #define rcu_dereference_sched_check(p, c) \ __rcu_dereference_check((p), (c) || rcu_read_lock_sched_held(), \ __rcu) /* * The tracing infrastructure traces RCU (we want that), but unfortunately * some of the RCU checks causes tracing to lock up the system. * * The no-tracing version of rcu_dereference_raw() must not call * rcu_read_lock_held(). */ #define rcu_dereference_raw_check(p) __rcu_dereference_check((p), 1, __rcu) /** * rcu_dereference_protected() - fetch RCU pointer when updates prevented * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * Return the value of the specified RCU-protected pointer, but omit * the READ_ONCE(). This is useful in cases where update-side locks * prevent the value of the pointer from changing. Please note that this * primitive does *not* prevent the compiler from repeating this reference * or combining it with other references, so it should not be used without * protection of appropriate locks. * * This function is only for update-side use. Using this function * when protected only by rcu_read_lock() will result in infrequent * but very ugly failures. */ #define rcu_dereference_protected(p, c) \ __rcu_dereference_protected((p), (c), __rcu) /** * rcu_dereference() - fetch RCU-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * This is a simple wrapper around rcu_dereference_check(). */ #define rcu_dereference(p) rcu_dereference_check(p, 0) /** * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * Makes rcu_dereference_check() do the dirty work. */ #define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0) /** * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * Makes rcu_dereference_check() do the dirty work. */ #define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0) /** * rcu_pointer_handoff() - Hand off a pointer from RCU to other mechanism * @p: The pointer to hand off * * This is simply an identity function, but it documents where a pointer * is handed off from RCU to some other synchronization mechanism, for * example, reference counting or locking. In C11, it would map to * kill_dependency(). It could be used as follows:: * * rcu_read_lock(); * p = rcu_dereference(gp); * long_lived = is_long_lived(p); * if (long_lived) { * if (!atomic_inc_not_zero(p->refcnt)) * long_lived = false; * else * p = rcu_pointer_handoff(p); * } * rcu_read_unlock(); */ #define rcu_pointer_handoff(p) (p) /** * rcu_read_lock() - mark the beginning of an RCU read-side critical section * * When synchronize_rcu() is invoked on one CPU while other CPUs * are within RCU read-side critical sections, then the * synchronize_rcu() is guaranteed to block until after all the other * CPUs exit their critical sections. Similarly, if call_rcu() is invoked * on one CPU while other CPUs are within RCU read-side critical * sections, invocation of the corresponding RCU callback is deferred * until after the all the other CPUs exit their critical sections. * * Note, however, that RCU callbacks are permitted to run concurrently * with new RCU read-side critical sections. One way that this can happen * is via the following sequence of events: (1) CPU 0 enters an RCU * read-side critical section, (2) CPU 1 invokes call_rcu() to register * an RCU callback, (3) CPU 0 exits the RCU read-side critical section, * (4) CPU 2 enters a RCU read-side critical section, (5) the RCU * callback is invoked. This is legal, because the RCU read-side critical * section that was running concurrently with the call_rcu() (and which * therefore might be referencing something that the corresponding RCU * callback would free up) has completed before the corresponding * RCU callback is invoked. * * RCU read-side critical sections may be nested. Any deferred actions * will be deferred until the outermost RCU read-side critical section * completes. * * You can avoid reading and understanding the next paragraph by * following this rule: don't put anything in an rcu_read_lock() RCU * read-side critical section that would block in a !PREEMPTION kernel. * But if you want the full story, read on! * * In non-preemptible RCU implementations (pure TREE_RCU and TINY_RCU), * it is illegal to block while in an RCU read-side critical section. * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION * kernel builds, RCU read-side critical sections may be preempted, * but explicit blocking is illegal. Finally, in preemptible RCU * implementations in real-time (with -rt patchset) kernel builds, RCU * read-side critical sections may be preempted and they may also block, but * only when acquiring spinlocks that are subject to priority inheritance. */ static __always_inline void rcu_read_lock(void) { __rcu_read_lock(); __acquire(RCU); rcu_lock_acquire(&rcu_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock() used illegally while idle"); } /* * So where is rcu_write_lock()? It does not exist, as there is no * way for writers to lock out RCU readers. This is a feature, not * a bug -- this property is what provides RCU's performance benefits. * Of course, writers must coordinate with each other. The normal * spinlock primitives work well for this, but any other technique may be * used as well. RCU does not care how the writers keep out of each * others' way, as long as they do so. */ /** * rcu_read_unlock() - marks the end of an RCU read-side critical section. * * In most situations, rcu_read_unlock() is immune from deadlock. * However, in kernels built with CONFIG_RCU_BOOST, rcu_read_unlock() * is responsible for deboosting, which it does via rt_mutex_unlock(). * Unfortunately, this function acquires the scheduler's runqueue and * priority-inheritance spinlocks. This means that deadlock could result * if the caller of rcu_read_unlock() already holds one of these locks or * any lock that is ever acquired while holding them. * * That said, RCU readers are never priority boosted unless they were * preempted. Therefore, one way to avoid deadlock is to make sure * that preemption never happens within any RCU read-side critical * section whose outermost rcu_read_unlock() is called with one of * rt_mutex_unlock()'s locks held. Such preemption can be avoided in * a number of ways, for example, by invoking preempt_disable() before * critical section's outermost rcu_read_lock(). * * Given that the set of locks acquired by rt_mutex_unlock() might change * at any time, a somewhat more future-proofed approach is to make sure * that that preemption never happens within any RCU read-side critical * section whose outermost rcu_read_unlock() is called with irqs disabled. * This approach relies on the fact that rt_mutex_unlock() currently only * acquires irq-disabled locks. * * The second of these two approaches is best in most situations, * however, the first approach can also be useful, at least to those * developers willing to keep abreast of the set of locks acquired by * rt_mutex_unlock(). * * See rcu_read_lock() for more information. */ static inline void rcu_read_unlock(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock() used illegally while idle"); __release(RCU); __rcu_read_unlock(); rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */ } /** * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section * * This is equivalent of rcu_read_lock(), but also disables softirqs. * Note that anything else that disables softirqs can also serve as * an RCU read-side critical section. * * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh() * must occur in the same context, for example, it is illegal to invoke * rcu_read_unlock_bh() from one task if the matching rcu_read_lock_bh() * was invoked from some other task. */ static inline void rcu_read_lock_bh(void) { local_bh_disable(); __acquire(RCU_BH); rcu_lock_acquire(&rcu_bh_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock_bh() used illegally while idle"); } /** * rcu_read_unlock_bh() - marks the end of a softirq-only RCU critical section * * See rcu_read_lock_bh() for more information. */ static inline void rcu_read_unlock_bh(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock_bh() used illegally while idle"); rcu_lock_release(&rcu_bh_lock_map); __release(RCU_BH); local_bh_enable(); } /** * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section * * This is equivalent of rcu_read_lock(), but disables preemption. * Read-side critical sections can also be introduced by anything else * that disables preemption, including local_irq_disable() and friends. * * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched() * must occur in the same context, for example, it is illegal to invoke * rcu_read_unlock_sched() from process context if the matching * rcu_read_lock_sched() was invoked from an NMI handler. */ static inline void rcu_read_lock_sched(void) { preempt_disable(); __acquire(RCU_SCHED); rcu_lock_acquire(&rcu_sched_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock_sched() used illegally while idle"); } /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */ static inline notrace void rcu_read_lock_sched_notrace(void) { preempt_disable_notrace(); __acquire(RCU_SCHED); } /** * rcu_read_unlock_sched() - marks the end of a RCU-classic critical section * * See rcu_read_lock_sched() for more information. */ static inline void rcu_read_unlock_sched(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock_sched() used illegally while idle"); rcu_lock_release(&rcu_sched_lock_map); __release(RCU_SCHED); preempt_enable(); } /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */ static inline notrace void rcu_read_unlock_sched_notrace(void) { __release(RCU_SCHED); preempt_enable_notrace(); } /** * RCU_INIT_POINTER() - initialize an RCU protected pointer * @p: The pointer to be initialized. * @v: The value to initialized the pointer to. * * Initialize an RCU-protected pointer in special cases where readers * do not need ordering constraints on the CPU or the compiler. These * special cases are: * * 1. This use of RCU_INIT_POINTER() is NULLing out the pointer *or* * 2. The caller has taken whatever steps are required to prevent * RCU readers from concurrently accessing this pointer *or* * 3. The referenced data structure has already been exposed to * readers either at compile time or via rcu_assign_pointer() *and* * * a. You have not made *any* reader-visible changes to * this structure since then *or* * b. It is OK for readers accessing this structure from its * new location to see the old state of the structure. (For * example, the changes were to statistical counters or to * other state where exact synchronization is not required.) * * Failure to follow these rules governing use of RCU_INIT_POINTER() will * result in impossible-to-diagnose memory corruption. As in the structures * will look OK in crash dumps, but any concurrent RCU readers might * see pre-initialized values of the referenced data structure. So * please be very careful how you use RCU_INIT_POINTER()!!! * * If you are creating an RCU-protected linked structure that is accessed * by a single external-to-structure RCU-protected pointer, then you may * use RCU_INIT_POINTER() to initialize the internal RCU-protected * pointers, but you must use rcu_assign_pointer() to initialize the * external-to-structure pointer *after* you have completely initialized * the reader-accessible portions of the linked structure. * * Note that unlike rcu_assign_pointer(), RCU_INIT_POINTER() provides no * ordering guarantees for either the CPU or the compiler. */ #define RCU_INIT_POINTER(p, v) \ do { \ rcu_check_sparse(p, __rcu); \ WRITE_ONCE(p, RCU_INITIALIZER(v)); \ } while (0) /** * RCU_POINTER_INITIALIZER() - statically initialize an RCU protected pointer * @p: The pointer to be initialized. * @v: The value to initialized the pointer to. * * GCC-style initialization for an RCU-protected pointer in a structure field. */ #define RCU_POINTER_INITIALIZER(p, v) \ .p = RCU_INITIALIZER(v) /* * Does the specified offset indicate that the corresponding rcu_head * structure can be handled by kvfree_rcu()? */ #define __is_kvfree_rcu_offset(offset) ((offset) < 4096) /* * Helper macro for kfree_rcu() to prevent argument-expansion eyestrain. */ #define __kvfree_rcu(head, offset) \ do { \ BUILD_BUG_ON(!__is_kvfree_rcu_offset(offset)); \ kvfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \ } while (0) /** * kfree_rcu() - kfree an object after a grace period. * @ptr: pointer to kfree * @rhf: the name of the struct rcu_head within the type of @ptr. * * Many rcu callbacks functions just call kfree() on the base structure. * These functions are trivial, but their size adds up, and furthermore * when they are used in a kernel module, that module must invoke the * high-latency rcu_barrier() function at module-unload time. * * The kfree_rcu() function handles this issue. Rather than encoding a * function address in the embedded rcu_head structure, kfree_rcu() instead * encodes the offset of the rcu_head structure within the base structure. * Because the functions are not allowed in the low-order 4096 bytes of * kernel virtual memory, offsets up to 4095 bytes can be accommodated. * If the offset is larger than 4095 bytes, a compile-time error will * be generated in __kvfree_rcu(). If this error is triggered, you can * either fall back to use of call_rcu() or rearrange the structure to * position the rcu_head structure into the first 4096 bytes. * * Note that the allowable offset might decrease in the future, for example, * to allow something like kmem_cache_free_rcu(). * * The BUILD_BUG_ON check must not involve any function calls, hence the * checks are done in macros here. */ #define kfree_rcu(ptr, rhf) \ do { \ typeof (ptr) ___p = (ptr); \ \ if (___p) \ __kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \ } while (0) /** * kvfree_rcu() - kvfree an object after a grace period. * * This macro consists of one or two arguments and it is * based on whether an object is head-less or not. If it * has a head then a semantic stays the same as it used * to be before: * * kvfree_rcu(ptr, rhf); * * where @ptr is a pointer to kvfree(), @rhf is the name * of the rcu_head structure within the type of @ptr. * * When it comes to head-less variant, only one argument * is passed and that is just a pointer which has to be * freed after a grace period. Therefore the semantic is * * kvfree_rcu(ptr); * * where @ptr is a pointer to kvfree(). * * Please note, head-less way of freeing is permitted to * use from a context that has to follow might_sleep() * annotation. Otherwise, please switch and embed the * rcu_head structure within the type of @ptr. */ #define kvfree_rcu(...) KVFREE_GET_MACRO(__VA_ARGS__, \ kvfree_rcu_arg_2, kvfree_rcu_arg_1)(__VA_ARGS__) #define KVFREE_GET_MACRO(_1, _2, NAME, ...) NAME #define kvfree_rcu_arg_2(ptr, rhf) kfree_rcu(ptr, rhf) #define kvfree_rcu_arg_1(ptr) \ do { \ typeof(ptr) ___p = (ptr); \ \ if (___p) \ kvfree_call_rcu(NULL, (rcu_callback_t) (___p)); \ } while (0) /* * Place this after a lock-acquisition primitive to guarantee that * an UNLOCK+LOCK pair acts as a full barrier. This guarantee applies * if the UNLOCK and LOCK are executed by the same CPU or if the * UNLOCK and LOCK operate on the same lock variable. */ #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE #define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ #else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ #define smp_mb__after_unlock_lock() do { } while (0) #endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ /* Has the specified rcu_head structure been handed to call_rcu()? */ /** * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu() * @rhp: The rcu_head structure to initialize. * * If you intend to invoke rcu_head_after_call_rcu() to test whether a * given rcu_head structure has already been passed to call_rcu(), then * you must also invoke this rcu_head_init() function on it just after * allocating that structure. Calls to this function must not race with * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation. */ static inline void rcu_head_init(struct rcu_head *rhp) { rhp->func = (rcu_callback_t)~0L; } /** * rcu_head_after_call_rcu() - Has this rcu_head been passed to call_rcu()? * @rhp: The rcu_head structure to test. * @f: The function passed to call_rcu() along with @rhp. * * Returns @true if the @rhp has been passed to call_rcu() with @func, * and @false otherwise. Emits a warning in any other case, including * the case where @rhp has already been invoked after a grace period. * Calls to this function must not race with callback invocation. One way * to avoid such races is to enclose the call to rcu_head_after_call_rcu() * in an RCU read-side critical section that includes a read-side fetch * of the pointer to the structure containing @rhp. */ static inline bool rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f) { rcu_callback_t func = READ_ONCE(rhp->func); if (func == f) return true; WARN_ON_ONCE(func != (rcu_callback_t)~0L); return false; } /* kernel/ksysfs.c definitions */ extern int rcu_expedited; extern int rcu_normal; #endif /* __LINUX_RCUPDATE_H */
2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_UIDGID_H #define _LINUX_UIDGID_H /* * A set of types for the internal kernel types representing uids and gids. * * The types defined in this header allow distinguishing which uids and gids in * the kernel are values used by userspace and which uid and gid values are * the internal kernel values. With the addition of user namespaces the values * can be different. Using the type system makes it possible for the compiler * to detect when we overlook these differences. * */ #include <linux/types.h> #include <linux/highuid.h> struct user_namespace; extern struct user_namespace init_user_ns; typedef struct { uid_t val; } kuid_t; typedef struct { gid_t val; } kgid_t; #define KUIDT_INIT(value) (kuid_t){ value } #define KGIDT_INIT(value) (kgid_t){ value } #ifdef CONFIG_MULTIUSER static inline uid_t __kuid_val(kuid_t uid) { return uid.val; } static inline gid_t __kgid_val(kgid_t gid) { return gid.val; } #else static inline uid_t __kuid_val(kuid_t uid) { return 0; } static inline gid_t __kgid_val(kgid_t gid) { return 0; } #endif #define GLOBAL_ROOT_UID KUIDT_INIT(0) #define GLOBAL_ROOT_GID KGIDT_INIT(0) #define INVALID_UID KUIDT_INIT(-1) #define INVALID_GID KGIDT_INIT(-1) static inline bool uid_eq(kuid_t left, kuid_t right) { return __kuid_val(left) == __kuid_val(right); } static inline bool gid_eq(kgid_t left, kgid_t right) { return __kgid_val(left) == __kgid_val(right); } static inline bool uid_gt(kuid_t left, kuid_t right) { return __kuid_val(left) > __kuid_val(right); } static inline bool gid_gt(kgid_t left, kgid_t right) { return __kgid_val(left) > __kgid_val(right); } static inline bool uid_gte(kuid_t left, kuid_t right) { return __kuid_val(left) >= __kuid_val(right); } static inline bool gid_gte(kgid_t left, kgid_t right) { return __kgid_val(left) >= __kgid_val(right); } static inline bool uid_lt(kuid_t left, kuid_t right) { return __kuid_val(left) < __kuid_val(right); } static inline bool gid_lt(kgid_t left, kgid_t right) { return __kgid_val(left) < __kgid_val(right); } static inline bool uid_lte(kuid_t left, kuid_t right) { return __kuid_val(left) <= __kuid_val(right); } static inline bool gid_lte(kgid_t left, kgid_t right) { return __kgid_val(left) <= __kgid_val(right); } static inline bool uid_valid(kuid_t uid) { return __kuid_val(uid) != (uid_t) -1; } static inline bool gid_valid(kgid_t gid) { return __kgid_val(gid) != (gid_t) -1; } #ifdef CONFIG_USER_NS extern kuid_t make_kuid(struct user_namespace *from, uid_t uid); extern kgid_t make_kgid(struct user_namespace *from, gid_t gid); extern uid_t from_kuid(struct user_namespace *to, kuid_t uid); extern gid_t from_kgid(struct user_namespace *to, kgid_t gid); extern uid_t from_kuid_munged(struct user_namespace *to, kuid_t uid); extern gid_t from_kgid_munged(struct user_namespace *to, kgid_t gid); static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid) { return from_kuid(ns, uid) != (uid_t) -1; } static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) { return from_kgid(ns, gid) != (gid_t) -1; } #else static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid) { return KUIDT_INIT(uid); } static inline kgid_t make_kgid(struct user_namespace *from, gid_t gid) { return KGIDT_INIT(gid); } static inline uid_t from_kuid(struct user_namespace *to, kuid_t kuid) { return __kuid_val(kuid); } static inline gid_t from_kgid(struct user_namespace *to, kgid_t kgid) { return __kgid_val(kgid); } static inline uid_t from_kuid_munged(struct user_namespace *to, kuid_t kuid) { uid_t uid = from_kuid(to, kuid); if (uid == (uid_t)-1) uid = overflowuid; return uid; } static inline gid_t from_kgid_munged(struct user_namespace *to, kgid_t kgid) { gid_t gid = from_kgid(to, kgid); if (gid == (gid_t)-1) gid = overflowgid; return gid; } static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid) { return uid_valid(uid); } static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) { return gid_valid(gid); } #endif /* CONFIG_USER_NS */ #endif /* _LINUX_UIDGID_H */
8 8 8 8 8 8 8 8 8 8 8 4 4 1 1 3 3 3 4 4 4 1 4 1 4 3 4 4 4 3 4 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 // SPDX-License-Identifier: GPL-2.0-or-later /* Task credentials management - see Documentation/security/credentials.rst * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/cred.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/coredump.h> #include <linux/key.h> #include <linux/keyctl.h> #include <linux/init_task.h> #include <linux/security.h> #include <linux/binfmts.h> #include <linux/cn_proc.h> #include <linux/uidgid.h> #if 0 #define kdebug(FMT, ...) \ printk("[%-5.5s%5u] " FMT "\n", \ current->comm, current->pid, ##__VA_ARGS__) #else #define kdebug(FMT, ...) \ do { \ if (0) \ no_printk("[%-5.5s%5u] " FMT "\n", \ current->comm, current->pid, ##__VA_ARGS__); \ } while (0) #endif static struct kmem_cache *cred_jar; /* init to 2 - one for init_task, one to ensure it is never freed */ struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; /* * The initial credentials for the initial task */ struct cred init_cred = { .usage = ATOMIC_INIT(4), #ifdef CONFIG_DEBUG_CREDENTIALS .subscribers = ATOMIC_INIT(2), .magic = CRED_MAGIC, #endif .uid = GLOBAL_ROOT_UID, .gid = GLOBAL_ROOT_GID, .suid = GLOBAL_ROOT_UID, .sgid = GLOBAL_ROOT_GID, .euid = GLOBAL_ROOT_UID, .egid = GLOBAL_ROOT_GID, .fsuid = GLOBAL_ROOT_UID, .fsgid = GLOBAL_ROOT_GID, .securebits = SECUREBITS_DEFAULT, .cap_inheritable = CAP_EMPTY_SET, .cap_permitted = CAP_FULL_SET, .cap_effective = CAP_FULL_SET, .cap_bset = CAP_FULL_SET, .user = INIT_USER, .user_ns = &init_user_ns, .group_info = &init_groups, }; static inline void set_cred_subscribers(struct cred *cred, int n) { #ifdef CONFIG_DEBUG_CREDENTIALS atomic_set(&cred->subscribers, n); #endif } static inline int read_cred_subscribers(const struct cred *cred) { #ifdef CONFIG_DEBUG_CREDENTIALS return atomic_read(&cred->subscribers); #else return 0; #endif } static inline void alter_cred_subscribers(const struct cred *_cred, int n) { #ifdef CONFIG_DEBUG_CREDENTIALS struct cred *cred = (struct cred *) _cred; atomic_add(n, &cred->subscribers); #endif } /* * The RCU callback to actually dispose of a set of credentials */ static void put_cred_rcu(struct rcu_head *rcu) { struct cred *cred = container_of(rcu, struct cred, rcu); kdebug("put_cred_rcu(%p)", cred); #ifdef CONFIG_DEBUG_CREDENTIALS if (cred->magic != CRED_MAGIC_DEAD || atomic_long_read(&cred->usage) != 0 || read_cred_subscribers(cred) != 0) panic("CRED: put_cred_rcu() sees %p with" " mag %x, put %p, usage %ld, subscr %d\n", cred, cred->magic, cred->put_addr, atomic_long_read(&cred->usage), read_cred_subscribers(cred)); #else if (atomic_long_read(&cred->usage) != 0) panic("CRED: put_cred_rcu() sees %p with usage %ld\n", cred, atomic_long_read(&cred->usage)); #endif security_cred_free(cred); key_put(cred->session_keyring); key_put(cred->process_keyring); key_put(cred->thread_keyring); key_put(cred->request_key_auth); if (cred->group_info) put_group_info(cred->group_info); free_uid(cred->user); put_user_ns(cred->user_ns); kmem_cache_free(cred_jar, cred); } /** * __put_cred - Destroy a set of credentials * @cred: The record to release * * Destroy a set of credentials on which no references remain. */ void __put_cred(struct cred *cred) { kdebug("__put_cred(%p{%ld,%d})", cred, atomic_long_read(&cred->usage), read_cred_subscribers(cred)); BUG_ON(atomic_long_read(&cred->usage) != 0); #ifdef CONFIG_DEBUG_CREDENTIALS BUG_ON(read_cred_subscribers(cred) != 0); cred->magic = CRED_MAGIC_DEAD; cred->put_addr = __builtin_return_address(0); #endif BUG_ON(cred == current->cred); BUG_ON(cred == current->real_cred); if (cred->non_rcu) put_cred_rcu(&cred->rcu); else call_rcu(&cred->rcu, put_cred_rcu); } EXPORT_SYMBOL(__put_cred); /* * Clean up a task's credentials when it exits */ void exit_creds(struct task_struct *tsk) { struct cred *cred; kdebug("exit_creds(%u,%p,%p,{%ld,%d})", tsk->pid, tsk->real_cred, tsk->cred, atomic_long_read(&tsk->cred->usage), read_cred_subscribers(tsk->cred)); cred = (struct cred *) tsk->real_cred; tsk->real_cred = NULL; validate_creds(cred); alter_cred_subscribers(cred, -1); put_cred(cred); cred = (struct cred *) tsk->cred; tsk->cred = NULL; validate_creds(cred); alter_cred_subscribers(cred, -1); put_cred(cred); #ifdef CONFIG_KEYS_REQUEST_CACHE key_put(tsk->cached_requested_key); tsk->cached_requested_key = NULL; #endif } /** * get_task_cred - Get another task's objective credentials * @task: The task to query * * Get the objective credentials of a task, pinning them so that they can't go * away. Accessing a task's credentials directly is not permitted. * * The caller must also make sure task doesn't get deleted, either by holding a * ref on task or by holding tasklist_lock to prevent it from being unlinked. */ const struct cred *get_task_cred(struct task_struct *task) { const struct cred *cred; rcu_read_lock(); do { cred = __task_cred((task)); BUG_ON(!cred); } while (!get_cred_rcu(cred)); rcu_read_unlock(); return cred; } EXPORT_SYMBOL(get_task_cred); /* * Allocate blank credentials, such that the credentials can be filled in at a * later date without risk of ENOMEM. */ struct cred *cred_alloc_blank(void) { struct cred *new; new = kmem_cache_zalloc(cred_jar, GFP_KERNEL); if (!new) return NULL; atomic_long_set(&new->usage, 1); #ifdef CONFIG_DEBUG_CREDENTIALS new->magic = CRED_MAGIC; #endif if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0) goto error; return new; error: abort_creds(new); return NULL; } /** * prepare_creds - Prepare a new set of credentials for modification * * Prepare a new set of task credentials for modification. A task's creds * shouldn't generally be modified directly, therefore this function is used to * prepare a new copy, which the caller then modifies and then commits by * calling commit_creds(). * * Preparation involves making a copy of the objective creds for modification. * * Returns a pointer to the new creds-to-be if successful, NULL otherwise. * * Call commit_creds() or abort_creds() to clean up. */ struct cred *prepare_creds(void) { struct task_struct *task = current; const struct cred *old; struct cred *new; validate_process_creds(); new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) return NULL; kdebug("prepare_creds() alloc %p", new); old = task->cred; memcpy(new, old, sizeof(struct cred)); new->non_rcu = 0; atomic_long_set(&new->usage, 1); set_cred_subscribers(new, 0); get_group_info(new->group_info); get_uid(new->user); get_user_ns(new->user_ns); #ifdef CONFIG_KEYS key_get(new->session_keyring); key_get(new->process_keyring); key_get(new->thread_keyring); key_get(new->request_key_auth); #endif #ifdef CONFIG_SECURITY new->security = NULL; #endif if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; validate_creds(new); return new; error: abort_creds(new); return NULL; } EXPORT_SYMBOL(prepare_creds); /* * Prepare credentials for current to perform an execve() * - The caller must hold ->cred_guard_mutex */ struct cred *prepare_exec_creds(void) { struct cred *new; new = prepare_creds(); if (!new) return new; #ifdef CONFIG_KEYS /* newly exec'd tasks don't get a thread keyring */ key_put(new->thread_keyring); new->thread_keyring = NULL; /* inherit the session keyring; new process keyring */ key_put(new->process_keyring); new->process_keyring = NULL; #endif new->suid = new->fsuid = new->euid; new->sgid = new->fsgid = new->egid; return new; } /* * Copy credentials for the new process created by fork() * * We share if we can, but under some circumstances we have to generate a new * set. * * The new process gets the current process's subjective credentials as its * objective and subjective credentials */ int copy_creds(struct task_struct *p, unsigned long clone_flags) { struct cred *new; int ret; #ifdef CONFIG_KEYS_REQUEST_CACHE p->cached_requested_key = NULL; #endif if ( #ifdef CONFIG_KEYS !p->cred->thread_keyring && #endif clone_flags & CLONE_THREAD ) { p->real_cred = get_cred(p->cred); get_cred(p->cred); alter_cred_subscribers(p->cred, 2); kdebug("share_creds(%p{%ld,%d})", p->cred, atomic_long_read(&p->cred->usage), read_cred_subscribers(p->cred)); atomic_inc(&p->cred->user->processes); return 0; } new = prepare_creds(); if (!new) return -ENOMEM; if (clone_flags & CLONE_NEWUSER) { ret = create_user_ns(new); if (ret < 0) goto error_put; } #ifdef CONFIG_KEYS /* new threads get their own thread keyrings if their parent already * had one */ if (new->thread_keyring) { key_put(new->thread_keyring); new->thread_keyring = NULL; if (clone_flags & CLONE_THREAD) install_thread_keyring_to_cred(new); } /* The process keyring is only shared between the threads in a process; * anything outside of those threads doesn't inherit. */ if (!(clone_flags & CLONE_THREAD)) { key_put(new->process_keyring); new->process_keyring = NULL; } #endif atomic_inc(&new->user->processes); p->cred = p->real_cred = get_cred(new); alter_cred_subscribers(new, 2); validate_creds(new); return 0; error_put: put_cred(new); return ret; } static bool cred_cap_issubset(const struct cred *set, const struct cred *subset) { const struct user_namespace *set_ns = set->user_ns; const struct user_namespace *subset_ns = subset->user_ns; /* If the two credentials are in the same user namespace see if * the capabilities of subset are a subset of set. */ if (set_ns == subset_ns) return cap_issubset(subset->cap_permitted, set->cap_permitted); /* The credentials are in a different user namespaces * therefore one is a subset of the other only if a set is an * ancestor of subset and set->euid is owner of subset or one * of subsets ancestors. */ for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) { if ((set_ns == subset_ns->parent) && uid_eq(subset_ns->owner, set->euid)) return true; } return false; } /** * commit_creds - Install new credentials upon the current task * @new: The credentials to be assigned * * Install a new set of credentials to the current task, using RCU to replace * the old set. Both the objective and the subjective credentials pointers are * updated. This function may not be called if the subjective credentials are * in an overridden state. * * This function eats the caller's reference to the new credentials. * * Always returns 0 thus allowing this function to be tail-called at the end * of, say, sys_setgid(). */ int commit_creds(struct cred *new) { struct task_struct *task = current; const struct cred *old = task->real_cred; kdebug("commit_creds(%p{%ld,%d})", new, atomic_long_read(&new->usage), read_cred_subscribers(new)); BUG_ON(task->cred != old); #ifdef CONFIG_DEBUG_CREDENTIALS BUG_ON(read_cred_subscribers(old) < 2); validate_creds(old); validate_creds(new); #endif BUG_ON(atomic_long_read(&new->usage) < 1); get_cred(new); /* we will require a ref for the subj creds too */ /* dumpability changes */ if (!uid_eq(old->euid, new->euid) || !gid_eq(old->egid, new->egid) || !uid_eq(old->fsuid, new->fsuid) || !gid_eq(old->fsgid, new->fsgid) || !cred_cap_issubset(old, new)) { if (task->mm) set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; /* * If a task drops privileges and becomes nondumpable, * the dumpability change must become visible before * the credential change; otherwise, a __ptrace_may_access() * racing with this change may be able to attach to a task it * shouldn't be able to attach to (as if the task had dropped * privileges without becoming nondumpable). * Pairs with a read barrier in __ptrace_may_access(). */ smp_wmb(); } /* alter the thread keyring */ if (!uid_eq(new->fsuid, old->fsuid)) key_fsuid_changed(new); if (!gid_eq(new->fsgid, old->fsgid)) key_fsgid_changed(new); /* do it * RLIMIT_NPROC limits on user->processes have already been checked * in set_user(). */ alter_cred_subscribers(new, 2); if (new->user != old->user) atomic_inc(&new->user->processes); rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); if (new->user != old->user) atomic_dec(&old->user->processes); alter_cred_subscribers(old, -2); /* send notifications */ if (!uid_eq(new->uid, old->uid) || !uid_eq(new->euid, old->euid) || !uid_eq(new->suid, old->suid) || !uid_eq(new->fsuid, old->fsuid)) proc_id_connector(task, PROC_EVENT_UID); if (!gid_eq(new->gid, old->gid) || !gid_eq(new->egid, old->egid) || !gid_eq(new->sgid, old->sgid) || !gid_eq(new->fsgid, old->fsgid)) proc_id_connector(task, PROC_EVENT_GID); /* release the old obj and subj refs both */ put_cred(old); put_cred(old); return 0; } EXPORT_SYMBOL(commit_creds); /** * abort_creds - Discard a set of credentials and unlock the current task * @new: The credentials that were going to be applied * * Discard a set of credentials that were under construction and unlock the * current task. */ void abort_creds(struct cred *new) { kdebug("abort_creds(%p{%ld,%d})", new, atomic_long_read(&new->usage), read_cred_subscribers(new)); #ifdef CONFIG_DEBUG_CREDENTIALS BUG_ON(read_cred_subscribers(new) != 0); #endif BUG_ON(atomic_long_read(&new->usage) < 1); put_cred(new); } EXPORT_SYMBOL(abort_creds); /** * override_creds - Override the current process's subjective credentials * @new: The credentials to be assigned * * Install a set of temporary override subjective credentials on the current * process, returning the old set for later reversion. */ const struct cred *override_creds(const struct cred *new) { const struct cred *old = current->cred; kdebug("override_creds(%p{%ld,%d})", new, atomic_long_read(&new->usage), read_cred_subscribers(new)); validate_creds(old); validate_creds(new); /* * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'. * * That means that we do not clear the 'non_rcu' flag, since * we are only installing the cred into the thread-synchronous * '->cred' pointer, not the '->real_cred' pointer that is * visible to other threads under RCU. * * Also note that we did validate_creds() manually, not depending * on the validation in 'get_cred()'. */ get_new_cred((struct cred *)new); alter_cred_subscribers(new, 1); rcu_assign_pointer(current->cred, new); alter_cred_subscribers(old, -1); kdebug("override_creds() = %p{%ld,%d}", old, atomic_long_read(&old->usage), read_cred_subscribers(old)); return old; } EXPORT_SYMBOL(override_creds); /** * revert_creds - Revert a temporary subjective credentials override * @old: The credentials to be restored * * Revert a temporary set of override subjective credentials to an old set, * discarding the override set. */ void revert_creds(const struct cred *old) { const struct cred *override = current->cred; kdebug("revert_creds(%p{%ld,%d})", old, atomic_long_read(&old->usage), read_cred_subscribers(old)); validate_creds(old); validate_creds(override); alter_cred_subscribers(old, 1); rcu_assign_pointer(current->cred, old); alter_cred_subscribers(override, -1); put_cred(override); } EXPORT_SYMBOL(revert_creds); /** * cred_fscmp - Compare two credentials with respect to filesystem access. * @a: The first credential * @b: The second credential * * cred_cmp() will return zero if both credentials have the same * fsuid, fsgid, and supplementary groups. That is, if they will both * provide the same access to files based on mode/uid/gid. * If the credentials are different, then either -1 or 1 will * be returned depending on whether @a comes before or after @b * respectively in an arbitrary, but stable, ordering of credentials. * * Return: -1, 0, or 1 depending on comparison */ int cred_fscmp(const struct cred *a, const struct cred *b) { struct group_info *ga, *gb; int g; if (a == b) return 0; if (uid_lt(a->fsuid, b->fsuid)) return -1; if (uid_gt(a->fsuid, b->fsuid)) return 1; if (gid_lt(a->fsgid, b->fsgid)) return -1; if (gid_gt(a->fsgid, b->fsgid)) return 1; ga = a->group_info; gb = b->group_info; if (ga == gb) return 0; if (ga == NULL) return -1; if (gb == NULL) return 1; if (ga->ngroups < gb->ngroups) return -1; if (ga->ngroups > gb->ngroups) return 1; for (g = 0; g < ga->ngroups; g++) { if (gid_lt(ga->gid[g], gb->gid[g])) return -1; if (gid_gt(ga->gid[g], gb->gid[g])) return 1; } return 0; } EXPORT_SYMBOL(cred_fscmp); /* * initialise the credentials stuff */ void __init cred_init(void) { /* allocate a slab in which we can store credentials */ cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); } /** * prepare_kernel_cred - Prepare a set of credentials for a kernel service * @daemon: A userspace daemon to be used as a reference * * Prepare a set of credentials for a kernel service. This can then be used to * override a task's own credentials so that work can be done on behalf of that * task that requires a different subjective context. * * @daemon is used to provide a base for the security record, but can be NULL. * If @daemon is supplied, then the security data will be derived from that; * otherwise they'll be set to 0 and no groups, full capabilities and no keys. * * The caller may change these controls afterwards if desired. * * Returns the new credentials or NULL if out of memory. */ struct cred *prepare_kernel_cred(struct task_struct *daemon) { const struct cred *old; struct cred *new; new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) return NULL; kdebug("prepare_kernel_cred() alloc %p", new); if (daemon) old = get_task_cred(daemon); else old = get_cred(&init_cred); validate_creds(old); *new = *old; new->non_rcu = 0; atomic_long_set(&new->usage, 1); set_cred_subscribers(new, 0); get_uid(new->user); get_user_ns(new->user_ns); get_group_info(new->group_info); #ifdef CONFIG_KEYS new->session_keyring = NULL; new->process_keyring = NULL; new->thread_keyring = NULL; new->request_key_auth = NULL; new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; #endif #ifdef CONFIG_SECURITY new->security = NULL; #endif if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; put_cred(old); validate_creds(new); return new; error: put_cred(new); put_cred(old); return NULL; } EXPORT_SYMBOL(prepare_kernel_cred); /** * set_security_override - Set the security ID in a set of credentials * @new: The credentials to alter * @secid: The LSM security ID to set * * Set the LSM security ID in a set of credentials so that the subjective * security is overridden when an alternative set of credentials is used. */ int set_security_override(struct cred *new, u32 secid) { return security_kernel_act_as(new, secid); } EXPORT_SYMBOL(set_security_override); /** * set_security_override_from_ctx - Set the security ID in a set of credentials * @new: The credentials to alter * @secctx: The LSM security context to generate the security ID from. * * Set the LSM security ID in a set of credentials so that the subjective * security is overridden when an alternative set of credentials is used. The * security ID is specified in string form as a security context to be * interpreted by the LSM. */ int set_security_override_from_ctx(struct cred *new, const char *secctx) { u32 secid; int ret; ret = security_secctx_to_secid(secctx, strlen(secctx), &secid); if (ret < 0) return ret; return set_security_override(new, secid); } EXPORT_SYMBOL(set_security_override_from_ctx); /** * set_create_files_as - Set the LSM file create context in a set of credentials * @new: The credentials to alter * @inode: The inode to take the context from * * Change the LSM file creation context in a set of credentials to be the same * as the object context of the specified inode, so that the new inodes have * the same MAC context as that inode. */ int set_create_files_as(struct cred *new, struct inode *inode) { if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) return -EINVAL; new->fsuid = inode->i_uid; new->fsgid = inode->i_gid; return security_kernel_create_files_as(new, inode); } EXPORT_SYMBOL(set_create_files_as); #ifdef CONFIG_DEBUG_CREDENTIALS bool creds_are_invalid(const struct cred *cred) { if (cred->magic != CRED_MAGIC) return true; return false; } EXPORT_SYMBOL(creds_are_invalid); /* * dump invalid credentials */ static void dump_invalid_creds(const struct cred *cred, const char *label, const struct task_struct *tsk) { printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n", label, cred, cred == &init_cred ? "[init]" : "", cred == tsk->real_cred ? "[real]" : "", cred == tsk->cred ? "[eff]" : ""); printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n", cred->magic, cred->put_addr); printk(KERN_ERR "CRED: ->usage=%ld, subscr=%d\n", atomic_long_read(&cred->usage), read_cred_subscribers(cred)); printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n", from_kuid_munged(&init_user_ns, cred->uid), from_kuid_munged(&init_user_ns, cred->euid), from_kuid_munged(&init_user_ns, cred->suid), from_kuid_munged(&init_user_ns, cred->fsuid)); printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n", from_kgid_munged(&init_user_ns, cred->gid), from_kgid_munged(&init_user_ns, cred->egid), from_kgid_munged(&init_user_ns, cred->sgid), from_kgid_munged(&init_user_ns, cred->fsgid)); #ifdef CONFIG_SECURITY printk(KERN_ERR "CRED: ->security is %p\n", cred->security); if ((unsigned long) cred->security >= PAGE_SIZE && (((unsigned long) cred->security & 0xffffff00) != (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))) printk(KERN_ERR "CRED: ->security {%x, %x}\n", ((u32*)cred->security)[0], ((u32*)cred->security)[1]); #endif } /* * report use of invalid credentials */ void __invalid_creds(const struct cred *cred, const char *file, unsigned line) { printk(KERN_ERR "CRED: Invalid credentials\n"); printk(KERN_ERR "CRED: At %s:%u\n", file, line); dump_invalid_creds(cred, "Specified", current); BUG(); } EXPORT_SYMBOL(__invalid_creds); /* * check the credentials on a process */ void __validate_process_creds(struct task_struct *tsk, const char *file, unsigned line) { if (tsk->cred == tsk->real_cred) { if (unlikely(read_cred_subscribers(tsk->cred) < 2 || creds_are_invalid(tsk->cred))) goto invalid_creds; } else { if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 || read_cred_subscribers(tsk->cred) < 1 || creds_are_invalid(tsk->real_cred) || creds_are_invalid(tsk->cred))) goto invalid_creds; } return; invalid_creds: printk(KERN_ERR "CRED: Invalid process credentials\n"); printk(KERN_ERR "CRED: At %s:%u\n", file, line); dump_invalid_creds(tsk->real_cred, "Real", tsk); if (tsk->cred != tsk->real_cred) dump_invalid_creds(tsk->cred, "Effective", tsk); else printk(KERN_ERR "CRED: Effective creds == Real creds\n"); BUG(); } EXPORT_SYMBOL(__validate_process_creds); /* * check creds for do_exit() */ void validate_creds_for_do_exit(struct task_struct *tsk) { kdebug("validate_creds_for_do_exit(%p,%p{%ld,%d})", tsk->real_cred, tsk->cred, atomic_long_read(&tsk->cred->usage), read_cred_subscribers(tsk->cred)); __validate_process_creds(tsk, __FILE__, __LINE__); } #endif /* CONFIG_DEBUG_CREDENTIALS */
8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 1994 Linus Torvalds * * Pentium III FXSR, SSE support * General FPU state handling cleanups * Gareth Hughes <gareth@valinux.com>, May 2000 * x86-64 work by Andi Kleen 2002 */ #ifndef _ASM_X86_FPU_INTERNAL_H #define _ASM_X86_FPU_INTERNAL_H #include <linux/compat.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/mm.h> #include <asm/user.h> #include <asm/fpu/api.h> #include <asm/fpu/xstate.h> #include <asm/fpu/xcr.h> #include <asm/cpufeature.h> #include <asm/trace/fpu.h> /* * High level FPU state handling functions: */ extern void fpu__prepare_read(struct fpu *fpu); extern void fpu__prepare_write(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__drop(struct fpu *fpu); extern int fpu__copy(struct task_struct *dst, struct task_struct *src); extern void fpu__clear_user_states(struct fpu *fpu); extern void fpu__clear_all(struct fpu *fpu); extern int fpu__exception_code(struct fpu *fpu, int trap_nr); /* * Boot time FPU initialization functions: */ extern void fpu__init_cpu(void); extern void fpu__init_system_xstate(void); extern void fpu__init_cpu_xstate(void); extern void fpu__init_system(void); extern void fpu__init_check_bugs(void); extern void fpu__resume_cpu(void); extern u64 fpu__get_supported_xfeatures_mask(void); /* * Debugging facility: */ #ifdef CONFIG_X86_DEBUG_FPU # define WARN_ON_FPU(x) WARN_ON_ONCE(x) #else # define WARN_ON_FPU(x) ({ (void)(x); 0; }) #endif /* * FPU related CPU feature flag helper routines: */ static __always_inline __pure bool use_xsaveopt(void) { return static_cpu_has(X86_FEATURE_XSAVEOPT); } static __always_inline __pure bool use_xsave(void) { return static_cpu_has(X86_FEATURE_XSAVE); } static __always_inline __pure bool use_fxsr(void) { return static_cpu_has(X86_FEATURE_FXSR); } /* * fpstate handling functions: */ extern union fpregs_state init_fpstate; extern void fpstate_init(union fpregs_state *state); #ifdef CONFIG_MATH_EMULATION extern void fpstate_init_soft(struct swregs_state *soft); #else static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif static inline void fpstate_init_xstate(struct xregs_state *xsave) { /* * XRSTORS requires these bits set in xcomp_bv, or it will * trigger #GP: */ xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all; } static inline void fpstate_init_fxstate(struct fxregs_state *fx) { fx->cwd = 0x37f; fx->mxcsr = MXCSR_DEFAULT; } extern void fpstate_sanitize_xstate(struct fpu *fpu); /* Returns 0 or the negated trap number, which results in -EFAULT for #PF */ #define user_insn(insn, output, input...) \ ({ \ int err; \ \ might_fault(); \ \ asm volatile(ASM_STAC "\n" \ "1: " #insn "\n" \ "2: " ASM_CLAC "\n" \ ".section .fixup,\"ax\"\n" \ "3: negl %%eax\n" \ " jmp 2b\n" \ ".previous\n" \ _ASM_EXTABLE_FAULT(1b, 3b) \ : [err] "=a" (err), output \ : "0"(0), input); \ err; \ }) #define kernel_insn_err(insn, output, input...) \ ({ \ int err; \ asm volatile("1:" #insn "\n\t" \ "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: movl $-1,%[err]\n" \ " jmp 2b\n" \ ".previous\n" \ _ASM_EXTABLE(1b, 3b) \ : [err] "=r" (err), output \ : "0"(0), input); \ err; \ }) #define kernel_insn(insn, output, input...) \ asm volatile("1:" #insn "\n\t" \ "2:\n" \ _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore) \ : output : input) static inline int copy_fregs_to_user(struct fregs_state __user *fx) { return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); } static inline int copy_fxregs_to_user(struct fxregs_state __user *fx) { if (IS_ENABLED(CONFIG_X86_32)) return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); else return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); } static inline void copy_kernel_to_fxregs(struct fxregs_state *fx) { if (IS_ENABLED(CONFIG_X86_32)) kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); else kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int copy_kernel_to_fxregs_err(struct fxregs_state *fx) { if (IS_ENABLED(CONFIG_X86_32)) return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); else return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) { if (IS_ENABLED(CONFIG_X86_32)) return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); else return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline void copy_kernel_to_fregs(struct fregs_state *fx) { kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int copy_kernel_to_fregs_err(struct fregs_state *fx) { return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int copy_user_to_fregs(struct fregs_state __user *fx) { return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline void copy_fxregs_to_kernel(struct fpu *fpu) { if (IS_ENABLED(CONFIG_X86_32)) asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave)); else asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave)); } static inline void fxsave(struct fxregs_state *fx) { if (IS_ENABLED(CONFIG_X86_32)) asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx)); else asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); } /* These macros all use (%edi)/(%rdi) as the single memory argument. */ #define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" #define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" #define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" #define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" #define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" /* * After this @err contains 0 on success or the negated trap number when * the operation raises an exception. For faults this results in -EFAULT. */ #define XSTATE_OP(op, st, lmask, hmask, err) \ asm volatile("1:" op "\n\t" \ "xor %[err], %[err]\n" \ "2:\n\t" \ ".pushsection .fixup,\"ax\"\n\t" \ "3: negl %%eax\n\t" \ "jmp 2b\n\t" \ ".popsection\n\t" \ _ASM_EXTABLE_FAULT(1b, 3b) \ : [err] "=a" (err) \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") /* * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact * format and supervisor states in addition to modified optimization in * XSAVEOPT. * * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT * supports modified optimization which is not supported by XSAVE. * * We use XSAVE as a fallback. * * The 661 label is defined in the ALTERNATIVE* macros as the address of the * original instruction which gets replaced. We need to use it here as the * address of the instruction where we might get an exception at. */ #define XSTATE_XSAVE(st, lmask, hmask, err) \ asm volatile(ALTERNATIVE_2(XSAVE, \ XSAVEOPT, X86_FEATURE_XSAVEOPT, \ XSAVES, X86_FEATURE_XSAVES) \ "\n" \ "xor %[err], %[err]\n" \ "3:\n" \ ".pushsection .fixup,\"ax\"\n" \ "4: movl $-2, %[err]\n" \ "jmp 3b\n" \ ".popsection\n" \ _ASM_EXTABLE(661b, 4b) \ : [err] "=r" (err) \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") /* * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact * XSAVE area format. */ #define XSTATE_XRESTORE(st, lmask, hmask) \ asm volatile(ALTERNATIVE(XRSTOR, \ XRSTORS, X86_FEATURE_XSAVES) \ "\n" \ "3:\n" \ _ASM_EXTABLE_HANDLE(661b, 3b, ex_handler_fprestore)\ : \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") /* * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. */ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) { u64 mask = -1; u32 lmask = mask; u32 hmask = mask >> 32; int err; WARN_ON(system_state != SYSTEM_BOOTING); if (boot_cpu_has(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); /* * We should never fault when copying from a kernel buffer, and the FPU * state we set at boot time should be valid. */ WARN_ON_FPU(err); } /* * Save processor xstate to xsave area. */ static inline void copy_xregs_to_kernel(struct xregs_state *xstate) { u64 mask = xfeatures_mask_all; u32 lmask = mask; u32 hmask = mask >> 32; int err; WARN_ON_FPU(!alternatives_patched); XSTATE_XSAVE(xstate, lmask, hmask, err); /* We should never fault when copying to a kernel buffer: */ WARN_ON_FPU(err); } /* * Restore processor xstate from xsave area. */ static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) { u32 lmask = mask; u32 hmask = mask >> 32; XSTATE_XRESTORE(xstate, lmask, hmask); } /* * Save xstate to user space xsave area. * * We don't use modified optimization because xrstor/xrstors might track * a different application. * * We don't use compacted format xsave area for * backward compatibility for old applications which don't understand * compacted format of xsave area. */ static inline int copy_xregs_to_user(struct xregs_state __user *buf) { u64 mask = xfeatures_mask_user(); u32 lmask = mask; u32 hmask = mask >> 32; int err; /* * Clear the xsave header first, so that reserved fields are * initialized to zero. */ err = __clear_user(&buf->header, sizeof(buf->header)); if (unlikely(err)) return -EFAULT; stac(); XSTATE_OP(XSAVE, buf, lmask, hmask, err); clac(); return err; } /* * Restore xstate from user space xsave area. */ static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask) { struct xregs_state *xstate = ((__force struct xregs_state *)buf); u32 lmask = mask; u32 hmask = mask >> 32; int err; stac(); XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); clac(); return err; } /* * Restore xstate from kernel space xsave area, return an error code instead of * an exception. */ static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask) { u32 lmask = mask; u32 hmask = mask >> 32; int err; if (static_cpu_has(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); return err; } extern int copy_fpregs_to_fpstate(struct fpu *fpu); static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask) { if (use_xsave()) { copy_kernel_to_xregs(&fpstate->xsave, mask); } else { if (use_fxsr()) copy_kernel_to_fxregs(&fpstate->fxsave); else copy_kernel_to_fregs(&fpstate->fsave); } } static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate) { /* * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is * pending. Clear the x87 state here by setting it to fixed values. * "m" is a random variable that should be in L1. */ if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) { asm volatile( "fnclex\n\t" "emms\n\t" "fildl %P[addr]" /* set F?P to defined value */ : : [addr] "m" (fpstate)); } __copy_kernel_to_fpregs(fpstate, -1); } extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); /* * FPU context switch related helper methods: */ DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); /* * The in-register FPU state for an FPU context on a CPU is assumed to be * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx * matches the FPU. * * If the FPU register state is valid, the kernel can skip restoring the * FPU state from memory. * * Any code that clobbers the FPU registers or updates the in-memory * FPU state for a task MUST let the rest of the kernel know that the * FPU registers are no longer valid for this task. * * Either one of these invalidation functions is enough. Invalidate * a resource you control: CPU if using the CPU for something else * (with preemption disabled), FPU for the current task, or a task that * is prevented from running by the current task. */ static inline void __cpu_invalidate_fpregs_state(void) { __this_cpu_write(fpu_fpregs_owner_ctx, NULL); } static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) { fpu->last_cpu = -1; } static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) { return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; } /* * These generally need preemption protection to work, * do try to avoid using these on their own: */ static inline void fpregs_deactivate(struct fpu *fpu) { this_cpu_write(fpu_fpregs_owner_ctx, NULL); trace_x86_fpu_regs_deactivated(fpu); } static inline void fpregs_activate(struct fpu *fpu) { this_cpu_write(fpu_fpregs_owner_ctx, fpu); trace_x86_fpu_regs_activated(fpu); } /* * Internal helper, do not use directly. Use switch_fpu_return() instead. */ static inline void __fpregs_load_activate(void) { struct fpu *fpu = &current->thread.fpu; int cpu = smp_processor_id(); if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) return; if (!fpregs_state_valid(fpu, cpu)) { copy_kernel_to_fpregs(&fpu->state); fpregs_activate(fpu); fpu->last_cpu = cpu; } clear_thread_flag(TIF_NEED_FPU_LOAD); } /* * FPU state switching for scheduling. * * This is a two-stage process: * * - switch_fpu_prepare() saves the old state. * This is done within the context of the old process. * * - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state * will get loaded on return to userspace, or when the kernel needs it. * * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers * are saved in the current thread's FPU register state. * * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not * hold current()'s FPU registers. It is required to load the * registers before returning to userland or using the content * otherwise. * * The FPU context is only stored/restored for a user task and * PF_KTHREAD is used to distinguish between kernel and user threads. */ static inline void switch_fpu_prepare(struct task_struct *prev, int cpu) { struct fpu *old_fpu = &prev->thread.fpu; if (static_cpu_has(X86_FEATURE_FPU) && !(prev->flags & PF_KTHREAD)) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; else old_fpu->last_cpu = cpu; /* But leave fpu_fpregs_owner_ctx! */ trace_x86_fpu_regs_deactivated(old_fpu); } } /* * Misc helper functions: */ /* * Load PKRU from the FPU context if available. Delay loading of the * complete FPU state until the return to userland. */ static inline void switch_fpu_finish(struct task_struct *next) { u32 pkru_val = init_pkru_value; struct pkru_state *pk; struct fpu *next_fpu = &next->thread.fpu; if (!static_cpu_has(X86_FEATURE_FPU)) return; set_thread_flag(TIF_NEED_FPU_LOAD); if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return; /* * PKRU state is switched eagerly because it needs to be valid before we * return to userland e.g. for a copy_to_user() operation. */ if (!(next->flags & PF_KTHREAD)) { /* * If the PKRU bit in xsave.header.xfeatures is not set, * then the PKRU component was in init state, which means * XRSTOR will set PKRU to 0. If the bit is not set then * get_xsave_addr() will return NULL because the PKRU value * in memory is not valid. This means pkru_val has to be * set to 0 and not to init_pkru_value. */ pk = get_xsave_addr(&next_fpu->state.xsave, XFEATURE_PKRU); pkru_val = pk ? pk->pkru : 0; } __write_pkru(pkru_val); } #endif /* _ASM_X86_FPU_INTERNAL_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LOCAL_LOCK_H # error "Do not include directly, include linux/local_lock.h" #endif #include <linux/percpu-defs.h> #include <linux/lockdep.h> typedef struct { #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; struct task_struct *owner; #endif } local_lock_t; #ifdef CONFIG_DEBUG_LOCK_ALLOC # define LOCAL_LOCK_DEBUG_INIT(lockname) \ .dep_map = { \ .name = #lockname, \ .wait_type_inner = LD_WAIT_CONFIG, \ .lock_type = LD_LOCK_PERCPU, \ }, \ .owner = NULL, static inline void local_lock_acquire(local_lock_t *l) { lock_map_acquire(&l->dep_map); DEBUG_LOCKS_WARN_ON(l->owner); l->owner = current; } static inline void local_lock_release(local_lock_t *l) { DEBUG_LOCKS_WARN_ON(l->owner != current); l->owner = NULL; lock_map_release(&l->dep_map); } static inline void local_lock_debug_init(local_lock_t *l) { l->owner = NULL; } #else /* CONFIG_DEBUG_LOCK_ALLOC */ # define LOCAL_LOCK_DEBUG_INIT(lockname) static inline void local_lock_acquire(local_lock_t *l) { } static inline void local_lock_release(local_lock_t *l) { } static inline void local_lock_debug_init(local_lock_t *l) { } #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ #define INIT_LOCAL_LOCK(lockname) { LOCAL_LOCK_DEBUG_INIT(lockname) } #define __local_lock_init(lock) \ do { \ static struct lock_class_key __key; \ \ debug_check_no_locks_freed((void *)lock, sizeof(*lock));\ lockdep_init_map_type(&(lock)->dep_map, #lock, &__key, \ 0, LD_WAIT_CONFIG, LD_WAIT_INV, \ LD_LOCK_PERCPU); \ local_lock_debug_init(lock); \ } while (0) #define __local_lock(lock) \ do { \ preempt_disable(); \ local_lock_acquire(this_cpu_ptr(lock)); \ } while (0) #define __local_lock_irq(lock) \ do { \ local_irq_disable(); \ local_lock_acquire(this_cpu_ptr(lock)); \ } while (0) #define __local_lock_irqsave(lock, flags) \ do { \ local_irq_save(flags); \ local_lock_acquire(this_cpu_ptr(lock)); \ } while (0) #define __local_unlock(lock) \ do { \ local_lock_release(this_cpu_ptr(lock)); \ preempt_enable(); \ } while (0) #define __local_unlock_irq(lock) \ do { \ local_lock_release(this_cpu_ptr(lock)); \ local_irq_enable(); \ } while (0) #define __local_unlock_irqrestore(lock, flags) \ do { \ local_lock_release(this_cpu_ptr(lock)); \ local_irq_restore(flags); \ } while (0)
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 /* SPDX-License-Identifier: GPL-2.0 */ /* * Wrapper functions for accessing the file_struct fd array. */ #ifndef __LINUX_FILE_H #define __LINUX_FILE_H #include <linux/compiler.h> #include <linux/types.h> #include <linux/posix_types.h> #include <linux/errno.h> struct file; extern void fput(struct file *); extern void fput_many(struct file *, unsigned int); struct file_operations; struct task_struct; struct vfsmount; struct dentry; struct inode; struct path; extern struct file *alloc_file_pseudo(struct inode *, struct vfsmount *, const char *, int flags, const struct file_operations *); extern struct file *alloc_file_clone(struct file *, int flags, const struct file_operations *); static inline void fput_light(struct file *file, int fput_needed) { if (fput_needed) fput(file); } struct fd { struct file *file; unsigned int flags; }; #define FDPUT_FPUT 1 #define FDPUT_POS_UNLOCK 2 static inline void fdput(struct fd fd) { if (fd.flags & FDPUT_FPUT) fput(fd.file); } extern struct file *fget(unsigned int fd); extern struct file *fget_many(unsigned int fd, unsigned int refs); extern struct file *fget_raw(unsigned int fd); extern struct file *fget_task(struct task_struct *task, unsigned int fd); extern unsigned long __fdget(unsigned int fd); extern unsigned long __fdget_raw(unsigned int fd); extern unsigned long __fdget_pos(unsigned int fd); extern void __f_unlock_pos(struct file *); static inline struct fd __to_fd(unsigned long v) { return (struct fd){(struct file *)(v & ~3),v & 3}; } static inline struct fd fdget(unsigned int fd) { return __to_fd(__fdget(fd)); } static inline struct fd fdget_raw(unsigned int fd) { return __to_fd(__fdget_raw(fd)); } static inline struct fd fdget_pos(int fd) { return __to_fd(__fdget_pos(fd)); } static inline void fdput_pos(struct fd f) { if (f.flags & FDPUT_POS_UNLOCK) __f_unlock_pos(f.file); fdput(f); } extern int f_dupfd(unsigned int from, struct file *file, unsigned flags); extern int replace_fd(unsigned fd, struct file *file, unsigned flags); extern void set_close_on_exec(unsigned int fd, int flag); extern bool get_close_on_exec(unsigned int fd); extern int __get_unused_fd_flags(unsigned flags, unsigned long nofile); extern int get_unused_fd_flags(unsigned flags); extern void put_unused_fd(unsigned int fd); extern void fd_install(unsigned int fd, struct file *file); extern int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int o_flags); static inline int receive_fd_user(struct file *file, int __user *ufd, unsigned int o_flags) { if (ufd == NULL) return -EFAULT; return __receive_fd(-1, file, ufd, o_flags); } static inline int receive_fd(struct file *file, unsigned int o_flags) { return __receive_fd(-1, file, NULL, o_flags); } static inline int receive_fd_replace(int fd, struct file *file, unsigned int o_flags) { return __receive_fd(fd, file, NULL, o_flags); } extern void flush_delayed_fput(void); extern void __fput_sync(struct file *); extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; #endif /* __LINUX_FILE_H */
1 1 1 3 3 3 3 3 1 1 1 1 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 // SPDX-License-Identifier: GPL-2.0-only /* * Implementation of the kernel access vector cache (AVC). * * Authors: Stephen Smalley, <sds@tycho.nsa.gov> * James Morris <jmorris@redhat.com> * * Update: KaiGai, Kohei <kaigai@ak.jp.nec.com> * Replaced the avc_lock spinlock by RCU. * * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #include <linux/types.h> #include <linux/stddef.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/dcache.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/percpu.h> #include <linux/list.h> #include <net/sock.h> #include <linux/un.h> #include <net/af_unix.h> #include <linux/ip.h> #include <linux/audit.h> #include <linux/ipv6.h> #include <net/ipv6.h> #include "avc.h" #include "avc_ss.h" #include "classmap.h" #define CREATE_TRACE_POINTS #include <trace/events/avc.h> #define AVC_CACHE_SLOTS 512 #define AVC_DEF_CACHE_THRESHOLD 512 #define AVC_CACHE_RECLAIM 16 #ifdef CONFIG_SECURITY_SELINUX_AVC_STATS #define avc_cache_stats_incr(field) this_cpu_inc(avc_cache_stats.field) #else #define avc_cache_stats_incr(field) do {} while (0) #endif struct avc_entry { u32 ssid; u32 tsid; u16 tclass; struct av_decision avd; struct avc_xperms_node *xp_node; }; struct avc_node { struct avc_entry ae; struct hlist_node list; /* anchored in avc_cache->slots[i] */ struct rcu_head rhead; }; struct avc_xperms_decision_node { struct extended_perms_decision xpd; struct list_head xpd_list; /* list of extended_perms_decision */ }; struct avc_xperms_node { struct extended_perms xp; struct list_head xpd_head; /* list head of extended_perms_decision */ }; struct avc_cache { struct hlist_head slots[AVC_CACHE_SLOTS]; /* head for avc_node->list */ spinlock_t slots_lock[AVC_CACHE_SLOTS]; /* lock for writes */ atomic_t lru_hint; /* LRU hint for reclaim scan */ atomic_t active_nodes; u32 latest_notif; /* latest revocation notification */ }; struct avc_callback_node { int (*callback) (u32 event); u32 events; struct avc_callback_node *next; }; #ifdef CONFIG_SECURITY_SELINUX_AVC_STATS DEFINE_PER_CPU(struct avc_cache_stats, avc_cache_stats) = { 0 }; #endif struct selinux_avc { unsigned int avc_cache_threshold; struct avc_cache avc_cache; }; static struct selinux_avc selinux_avc; void selinux_avc_init(struct selinux_avc **avc) { int i; selinux_avc.avc_cache_threshold = AVC_DEF_CACHE_THRESHOLD; for (i = 0; i < AVC_CACHE_SLOTS; i++) { INIT_HLIST_HEAD(&selinux_avc.avc_cache.slots[i]); spin_lock_init(&selinux_avc.avc_cache.slots_lock[i]); } atomic_set(&selinux_avc.avc_cache.active_nodes, 0); atomic_set(&selinux_avc.avc_cache.lru_hint, 0); *avc = &selinux_avc; } unsigned int avc_get_cache_threshold(struct selinux_avc *avc) { return avc->avc_cache_threshold; } void avc_set_cache_threshold(struct selinux_avc *avc, unsigned int cache_threshold) { avc->avc_cache_threshold = cache_threshold; } static struct avc_callback_node *avc_callbacks; static struct kmem_cache *avc_node_cachep; static struct kmem_cache *avc_xperms_data_cachep; static struct kmem_cache *avc_xperms_decision_cachep; static struct kmem_cache *avc_xperms_cachep; static inline int avc_hash(u32 ssid, u32 tsid, u16 tclass) { return (ssid ^ (tsid<<2) ^ (tclass<<4)) & (AVC_CACHE_SLOTS - 1); } /** * avc_init - Initialize the AVC. * * Initialize the access vector cache. */ void __init avc_init(void) { avc_node_cachep = kmem_cache_create("avc_node", sizeof(struct avc_node), 0, SLAB_PANIC, NULL); avc_xperms_cachep = kmem_cache_create("avc_xperms_node", sizeof(struct avc_xperms_node), 0, SLAB_PANIC, NULL); avc_xperms_decision_cachep = kmem_cache_create( "avc_xperms_decision_node", sizeof(struct avc_xperms_decision_node), 0, SLAB_PANIC, NULL); avc_xperms_data_cachep = kmem_cache_create("avc_xperms_data", sizeof(struct extended_perms_data), 0, SLAB_PANIC, NULL); } int avc_get_hash_stats(struct selinux_avc *avc, char *page) { int i, chain_len, max_chain_len, slots_used; struct avc_node *node; struct hlist_head *head; rcu_read_lock(); slots_used = 0; max_chain_len = 0; for (i = 0; i < AVC_CACHE_SLOTS; i++) { head = &avc->avc_cache.slots[i]; if (!hlist_empty(head)) { slots_used++; chain_len = 0; hlist_for_each_entry_rcu(node, head, list) chain_len++; if (chain_len > max_chain_len) max_chain_len = chain_len; } } rcu_read_unlock(); return scnprintf(page, PAGE_SIZE, "entries: %d\nbuckets used: %d/%d\n" "longest chain: %d\n", atomic_read(&avc->avc_cache.active_nodes), slots_used, AVC_CACHE_SLOTS, max_chain_len); } /* * using a linked list for extended_perms_decision lookup because the list is * always small. i.e. less than 5, typically 1 */ static struct extended_perms_decision *avc_xperms_decision_lookup(u8 driver, struct avc_xperms_node *xp_node) { struct avc_xperms_decision_node *xpd_node; list_for_each_entry(xpd_node, &xp_node->xpd_head, xpd_list) { if (xpd_node->xpd.driver == driver) return &xpd_node->xpd; } return NULL; } static inline unsigned int avc_xperms_has_perm(struct extended_perms_decision *xpd, u8 perm, u8 which) { unsigned int rc = 0; if ((which == XPERMS_ALLOWED) && (xpd->used & XPERMS_ALLOWED)) rc = security_xperm_test(xpd->allowed->p, perm); else if ((which == XPERMS_AUDITALLOW) && (xpd->used & XPERMS_AUDITALLOW)) rc = security_xperm_test(xpd->auditallow->p, perm); else if ((which == XPERMS_DONTAUDIT) && (xpd->used & XPERMS_DONTAUDIT)) rc = security_xperm_test(xpd->dontaudit->p, perm); return rc; } static void avc_xperms_allow_perm(struct avc_xperms_node *xp_node, u8 driver, u8 perm) { struct extended_perms_decision *xpd; security_xperm_set(xp_node->xp.drivers.p, driver); xpd = avc_xperms_decision_lookup(driver, xp_node); if (xpd && xpd->allowed) security_xperm_set(xpd->allowed->p, perm); } static void avc_xperms_decision_free(struct avc_xperms_decision_node *xpd_node) { struct extended_perms_decision *xpd; xpd = &xpd_node->xpd; if (xpd->allowed) kmem_cache_free(avc_xperms_data_cachep, xpd->allowed); if (xpd->auditallow) kmem_cache_free(avc_xperms_data_cachep, xpd->auditallow); if (xpd->dontaudit) kmem_cache_free(avc_xperms_data_cachep, xpd->dontaudit); kmem_cache_free(avc_xperms_decision_cachep, xpd_node); } static void avc_xperms_free(struct avc_xperms_node *xp_node) { struct avc_xperms_decision_node *xpd_node, *tmp; if (!xp_node) return; list_for_each_entry_safe(xpd_node, tmp, &xp_node->xpd_head, xpd_list) { list_del(&xpd_node->xpd_list); avc_xperms_decision_free(xpd_node); } kmem_cache_free(avc_xperms_cachep, xp_node); } static void avc_copy_xperms_decision(struct extended_perms_decision *dest, struct extended_perms_decision *src) { dest->driver = src->driver; dest->used = src->used; if (dest->used & XPERMS_ALLOWED) memcpy(dest->allowed->p, src->allowed->p, sizeof(src->allowed->p)); if (dest->used & XPERMS_AUDITALLOW) memcpy(dest->auditallow->p, src->auditallow->p, sizeof(src->auditallow->p)); if (dest->used & XPERMS_DONTAUDIT) memcpy(dest->dontaudit->p, src->dontaudit->p, sizeof(src->dontaudit->p)); } /* * similar to avc_copy_xperms_decision, but only copy decision * information relevant to this perm */ static inline void avc_quick_copy_xperms_decision(u8 perm, struct extended_perms_decision *dest, struct extended_perms_decision *src) { /* * compute index of the u32 of the 256 bits (8 u32s) that contain this * command permission */ u8 i = perm >> 5; dest->used = src->used; if (dest->used & XPERMS_ALLOWED) dest->allowed->p[i] = src->allowed->p[i]; if (dest->used & XPERMS_AUDITALLOW) dest->auditallow->p[i] = src->auditallow->p[i]; if (dest->used & XPERMS_DONTAUDIT) dest->dontaudit->p[i] = src->dontaudit->p[i]; } static struct avc_xperms_decision_node *avc_xperms_decision_alloc(u8 which) { struct avc_xperms_decision_node *xpd_node; struct extended_perms_decision *xpd; xpd_node = kmem_cache_zalloc(avc_xperms_decision_cachep, GFP_NOWAIT | __GFP_NOWARN); if (!xpd_node) return NULL; xpd = &xpd_node->xpd; if (which & XPERMS_ALLOWED) { xpd->allowed = kmem_cache_zalloc(avc_xperms_data_cachep, GFP_NOWAIT | __GFP_NOWARN); if (!xpd->allowed) goto error; } if (which & XPERMS_AUDITALLOW) { xpd->auditallow = kmem_cache_zalloc(avc_xperms_data_cachep, GFP_NOWAIT | __GFP_NOWARN); if (!xpd->auditallow) goto error; } if (which & XPERMS_DONTAUDIT) { xpd->dontaudit = kmem_cache_zalloc(avc_xperms_data_cachep, GFP_NOWAIT | __GFP_NOWARN); if (!xpd->dontaudit) goto error; } return xpd_node; error: avc_xperms_decision_free(xpd_node); return NULL; } static int avc_add_xperms_decision(struct avc_node *node, struct extended_perms_decision *src) { struct avc_xperms_decision_node *dest_xpd; node->ae.xp_node->xp.len++; dest_xpd = avc_xperms_decision_alloc(src->used); if (!dest_xpd) return -ENOMEM; avc_copy_xperms_decision(&dest_xpd->xpd, src); list_add(&dest_xpd->xpd_list, &node->ae.xp_node->xpd_head); return 0; } static struct avc_xperms_node *avc_xperms_alloc(void) { struct avc_xperms_node *xp_node; xp_node = kmem_cache_zalloc(avc_xperms_cachep, GFP_NOWAIT | __GFP_NOWARN); if (!xp_node) return xp_node; INIT_LIST_HEAD(&xp_node->xpd_head); return xp_node; } static int avc_xperms_populate(struct avc_node *node, struct avc_xperms_node *src) { struct avc_xperms_node *dest; struct avc_xperms_decision_node *dest_xpd; struct avc_xperms_decision_node *src_xpd; if (src->xp.len == 0) return 0; dest = avc_xperms_alloc(); if (!dest) return -ENOMEM; memcpy(dest->xp.drivers.p, src->xp.drivers.p, sizeof(dest->xp.drivers.p)); dest->xp.len = src->xp.len; /* for each source xpd allocate a destination xpd and copy */ list_for_each_entry(src_xpd, &src->xpd_head, xpd_list) { dest_xpd = avc_xperms_decision_alloc(src_xpd->xpd.used); if (!dest_xpd) goto error; avc_copy_xperms_decision(&dest_xpd->xpd, &src_xpd->xpd); list_add(&dest_xpd->xpd_list, &dest->xpd_head); } node->ae.xp_node = dest; return 0; error: avc_xperms_free(dest); return -ENOMEM; } static inline u32 avc_xperms_audit_required(u32 requested, struct av_decision *avd, struct extended_perms_decision *xpd, u8 perm, int result, u32 *deniedp) { u32 denied, audited; denied = requested & ~avd->allowed; if (unlikely(denied)) { audited = denied & avd->auditdeny; if (audited && xpd) { if (avc_xperms_has_perm(xpd, perm, XPERMS_DONTAUDIT)) audited &= ~requested; } } else if (result) { audited = denied = requested; } else { audited = requested & avd->auditallow; if (audited && xpd) { if (!avc_xperms_has_perm(xpd, perm, XPERMS_AUDITALLOW)) audited &= ~requested; } } *deniedp = denied; return audited; } static inline int avc_xperms_audit(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, u32 requested, struct av_decision *avd, struct extended_perms_decision *xpd, u8 perm, int result, struct common_audit_data *ad) { u32 audited, denied; audited = avc_xperms_audit_required( requested, avd, xpd, perm, result, &denied); if (likely(!audited)) return 0; return slow_avc_audit(state, ssid, tsid, tclass, requested, audited, denied, result, ad); } static void avc_node_free(struct rcu_head *rhead) { struct avc_node *node = container_of(rhead, struct avc_node, rhead); avc_xperms_free(node->ae.xp_node); kmem_cache_free(avc_node_cachep, node); avc_cache_stats_incr(frees); } static void avc_node_delete(struct selinux_avc *avc, struct avc_node *node) { hlist_del_rcu(&node->list); call_rcu(&node->rhead, avc_node_free); atomic_dec(&avc->avc_cache.active_nodes); } static void avc_node_kill(struct selinux_avc *avc, struct avc_node *node) { avc_xperms_free(node->ae.xp_node); kmem_cache_free(avc_node_cachep, node); avc_cache_stats_incr(frees); atomic_dec(&avc->avc_cache.active_nodes); } static void avc_node_replace(struct selinux_avc *avc, struct avc_node *new, struct avc_node *old) { hlist_replace_rcu(&old->list, &new->list); call_rcu(&old->rhead, avc_node_free); atomic_dec(&avc->avc_cache.active_nodes); } static inline int avc_reclaim_node(struct selinux_avc *avc) { struct avc_node *node; int hvalue, try, ecx; unsigned long flags; struct hlist_head *head; spinlock_t *lock; for (try = 0, ecx = 0; try < AVC_CACHE_SLOTS; try++) { hvalue = atomic_inc_return(&avc->avc_cache.lru_hint) & (AVC_CACHE_SLOTS - 1); head = &avc->avc_cache.slots[hvalue]; lock = &avc->avc_cache.slots_lock[hvalue]; if (!spin_trylock_irqsave(lock, flags)) continue; rcu_read_lock(); hlist_for_each_entry(node, head, list) { avc_node_delete(avc, node); avc_cache_stats_incr(reclaims); ecx++; if (ecx >= AVC_CACHE_RECLAIM) { rcu_read_unlock(); spin_unlock_irqrestore(lock, flags); goto out; } } rcu_read_unlock(); spin_unlock_irqrestore(lock, flags); } out: return ecx; } static struct avc_node *avc_alloc_node(struct selinux_avc *avc) { struct avc_node *node; node = kmem_cache_zalloc(avc_node_cachep, GFP_NOWAIT | __GFP_NOWARN); if (!node) goto out; INIT_HLIST_NODE(&node->list); avc_cache_stats_incr(allocations); if (atomic_inc_return(&avc->avc_cache.active_nodes) > avc->avc_cache_threshold) avc_reclaim_node(avc); out: return node; } static void avc_node_populate(struct avc_node *node, u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd) { node->ae.ssid = ssid; node->ae.tsid = tsid; node->ae.tclass = tclass; memcpy(&node->ae.avd, avd, sizeof(node->ae.avd)); } static inline struct avc_node *avc_search_node(struct selinux_avc *avc, u32 ssid, u32 tsid, u16 tclass) { struct avc_node *node, *ret = NULL; int hvalue; struct hlist_head *head; hvalue = avc_hash(ssid, tsid, tclass); head = &avc->avc_cache.slots[hvalue]; hlist_for_each_entry_rcu(node, head, list) { if (ssid == node->ae.ssid && tclass == node->ae.tclass && tsid == node->ae.tsid) { ret = node; break; } } return ret; } /** * avc_lookup - Look up an AVC entry. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * * Look up an AVC entry that is valid for the * (@ssid, @tsid), interpreting the permissions * based on @tclass. If a valid AVC entry exists, * then this function returns the avc_node. * Otherwise, this function returns NULL. */ static struct avc_node *avc_lookup(struct selinux_avc *avc, u32 ssid, u32 tsid, u16 tclass) { struct avc_node *node; avc_cache_stats_incr(lookups); node = avc_search_node(avc, ssid, tsid, tclass); if (node) return node; avc_cache_stats_incr(misses); return NULL; } static int avc_latest_notif_update(struct selinux_avc *avc, int seqno, int is_insert) { int ret = 0; static DEFINE_SPINLOCK(notif_lock); unsigned long flag; spin_lock_irqsave(&notif_lock, flag); if (is_insert) { if (seqno < avc->avc_cache.latest_notif) { pr_warn("SELinux: avc: seqno %d < latest_notif %d\n", seqno, avc->avc_cache.latest_notif); ret = -EAGAIN; } } else { if (seqno > avc->avc_cache.latest_notif) avc->avc_cache.latest_notif = seqno; } spin_unlock_irqrestore(&notif_lock, flag); return ret; } /** * avc_insert - Insert an AVC entry. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @avd: resulting av decision * @xp_node: resulting extended permissions * * Insert an AVC entry for the SID pair * (@ssid, @tsid) and class @tclass. * The access vectors and the sequence number are * normally provided by the security server in * response to a security_compute_av() call. If the * sequence number @avd->seqno is not less than the latest * revocation notification, then the function copies * the access vectors into a cache entry, returns * avc_node inserted. Otherwise, this function returns NULL. */ static struct avc_node *avc_insert(struct selinux_avc *avc, u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd, struct avc_xperms_node *xp_node) { struct avc_node *pos, *node = NULL; int hvalue; unsigned long flag; spinlock_t *lock; struct hlist_head *head; if (avc_latest_notif_update(avc, avd->seqno, 1)) return NULL; node = avc_alloc_node(avc); if (!node) return NULL; avc_node_populate(node, ssid, tsid, tclass, avd); if (avc_xperms_populate(node, xp_node)) { avc_node_kill(avc, node); return NULL; } hvalue = avc_hash(ssid, tsid, tclass); head = &avc->avc_cache.slots[hvalue]; lock = &avc->avc_cache.slots_lock[hvalue]; spin_lock_irqsave(lock, flag); hlist_for_each_entry(pos, head, list) { if (pos->ae.ssid == ssid && pos->ae.tsid == tsid && pos->ae.tclass == tclass) { avc_node_replace(avc, node, pos); goto found; } } hlist_add_head_rcu(&node->list, head); found: spin_unlock_irqrestore(lock, flag); return node; } /** * avc_audit_pre_callback - SELinux specific information * will be called by generic audit code * @ab: the audit buffer * @a: audit_data */ static void avc_audit_pre_callback(struct audit_buffer *ab, void *a) { struct common_audit_data *ad = a; struct selinux_audit_data *sad = ad->selinux_audit_data; u32 av = sad->audited; const char **perms; int i, perm; audit_log_format(ab, "avc: %s ", sad->denied ? "denied" : "granted"); if (av == 0) { audit_log_format(ab, " null"); return; } perms = secclass_map[sad->tclass-1].perms; audit_log_format(ab, " {"); i = 0; perm = 1; while (i < (sizeof(av) * 8)) { if ((perm & av) && perms[i]) { audit_log_format(ab, " %s", perms[i]); av &= ~perm; } i++; perm <<= 1; } if (av) audit_log_format(ab, " 0x%x", av); audit_log_format(ab, " } for "); } /** * avc_audit_post_callback - SELinux specific information * will be called by generic audit code * @ab: the audit buffer * @a: audit_data */ static void avc_audit_post_callback(struct audit_buffer *ab, void *a) { struct common_audit_data *ad = a; struct selinux_audit_data *sad = ad->selinux_audit_data; char *scontext = NULL; char *tcontext = NULL; const char *tclass = NULL; u32 scontext_len; u32 tcontext_len; int rc; rc = security_sid_to_context(sad->state, sad->ssid, &scontext, &scontext_len); if (rc) audit_log_format(ab, " ssid=%d", sad->ssid); else audit_log_format(ab, " scontext=%s", scontext); rc = security_sid_to_context(sad->state, sad->tsid, &tcontext, &tcontext_len); if (rc) audit_log_format(ab, " tsid=%d", sad->tsid); else audit_log_format(ab, " tcontext=%s", tcontext); tclass = secclass_map[sad->tclass-1].name; audit_log_format(ab, " tclass=%s", tclass); if (sad->denied) audit_log_format(ab, " permissive=%u", sad->result ? 0 : 1); trace_selinux_audited(sad, scontext, tcontext, tclass); kfree(tcontext); kfree(scontext); /* in case of invalid context report also the actual context string */ rc = security_sid_to_context_inval(sad->state, sad->ssid, &scontext, &scontext_len); if (!rc && scontext) { if (scontext_len && scontext[scontext_len - 1] == '\0') scontext_len--; audit_log_format(ab, " srawcon="); audit_log_n_untrustedstring(ab, scontext, scontext_len); kfree(scontext); } rc = security_sid_to_context_inval(sad->state, sad->tsid, &scontext, &scontext_len); if (!rc && scontext) { if (scontext_len && scontext[scontext_len - 1] == '\0') scontext_len--; audit_log_format(ab, " trawcon="); audit_log_n_untrustedstring(ab, scontext, scontext_len); kfree(scontext); } } /* This is the slow part of avc audit with big stack footprint */ noinline int slow_avc_audit(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, u32 requested, u32 audited, u32 denied, int result, struct common_audit_data *a) { struct common_audit_data stack_data; struct selinux_audit_data sad; if (WARN_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map))) return -EINVAL; if (!a) { a = &stack_data; a->type = LSM_AUDIT_DATA_NONE; } sad.tclass = tclass; sad.requested = requested; sad.ssid = ssid; sad.tsid = tsid; sad.audited = audited; sad.denied = denied; sad.result = result; sad.state = state; a->selinux_audit_data = &sad; common_lsm_audit(a, avc_audit_pre_callback, avc_audit_post_callback); return 0; } /** * avc_add_callback - Register a callback for security events. * @callback: callback function * @events: security events * * Register a callback function for events in the set @events. * Returns %0 on success or -%ENOMEM if insufficient memory * exists to add the callback. */ int __init avc_add_callback(int (*callback)(u32 event), u32 events) { struct avc_callback_node *c; int rc = 0; c = kmalloc(sizeof(*c), GFP_KERNEL); if (!c) { rc = -ENOMEM; goto out; } c->callback = callback; c->events = events; c->next = avc_callbacks; avc_callbacks = c; out: return rc; } /** * avc_update_node Update an AVC entry * @event : Updating event * @perms : Permission mask bits * @ssid,@tsid,@tclass : identifier of an AVC entry * @seqno : sequence number when decision was made * @xpd: extended_perms_decision to be added to the node * @flags: the AVC_* flags, e.g. AVC_NONBLOCKING, AVC_EXTENDED_PERMS, or 0. * * if a valid AVC entry doesn't exist,this function returns -ENOENT. * if kmalloc() called internal returns NULL, this function returns -ENOMEM. * otherwise, this function updates the AVC entry. The original AVC-entry object * will release later by RCU. */ static int avc_update_node(struct selinux_avc *avc, u32 event, u32 perms, u8 driver, u8 xperm, u32 ssid, u32 tsid, u16 tclass, u32 seqno, struct extended_perms_decision *xpd, u32 flags) { int hvalue, rc = 0; unsigned long flag; struct avc_node *pos, *node, *orig = NULL; struct hlist_head *head; spinlock_t *lock; /* * If we are in a non-blocking code path, e.g. VFS RCU walk, * then we must not add permissions to a cache entry * because we will not audit the denial. Otherwise, * during the subsequent blocking retry (e.g. VFS ref walk), we * will find the permissions already granted in the cache entry * and won't audit anything at all, leading to silent denials in * permissive mode that only appear when in enforcing mode. * * See the corresponding handling of MAY_NOT_BLOCK in avc_audit() * and selinux_inode_permission(). */ if (flags & AVC_NONBLOCKING) return 0; node = avc_alloc_node(avc); if (!node) { rc = -ENOMEM; goto out; } /* Lock the target slot */ hvalue = avc_hash(ssid, tsid, tclass); head = &avc->avc_cache.slots[hvalue]; lock = &avc->avc_cache.slots_lock[hvalue]; spin_lock_irqsave(lock, flag); hlist_for_each_entry(pos, head, list) { if (ssid == pos->ae.ssid && tsid == pos->ae.tsid && tclass == pos->ae.tclass && seqno == pos->ae.avd.seqno){ orig = pos; break; } } if (!orig) { rc = -ENOENT; avc_node_kill(avc, node); goto out_unlock; } /* * Copy and replace original node. */ avc_node_populate(node, ssid, tsid, tclass, &orig->ae.avd); if (orig->ae.xp_node) { rc = avc_xperms_populate(node, orig->ae.xp_node); if (rc) { avc_node_kill(avc, node); goto out_unlock; } } switch (event) { case AVC_CALLBACK_GRANT: node->ae.avd.allowed |= perms; if (node->ae.xp_node && (flags & AVC_EXTENDED_PERMS)) avc_xperms_allow_perm(node->ae.xp_node, driver, xperm); break; case AVC_CALLBACK_TRY_REVOKE: case AVC_CALLBACK_REVOKE: node->ae.avd.allowed &= ~perms; break; case AVC_CALLBACK_AUDITALLOW_ENABLE: node->ae.avd.auditallow |= perms; break; case AVC_CALLBACK_AUDITALLOW_DISABLE: node->ae.avd.auditallow &= ~perms; break; case AVC_CALLBACK_AUDITDENY_ENABLE: node->ae.avd.auditdeny |= perms; break; case AVC_CALLBACK_AUDITDENY_DISABLE: node->ae.avd.auditdeny &= ~perms; break; case AVC_CALLBACK_ADD_XPERMS: avc_add_xperms_decision(node, xpd); break; } avc_node_replace(avc, node, orig); out_unlock: spin_unlock_irqrestore(lock, flag); out: return rc; } /** * avc_flush - Flush the cache */ static void avc_flush(struct selinux_avc *avc) { struct hlist_head *head; struct avc_node *node; spinlock_t *lock; unsigned long flag; int i; for (i = 0; i < AVC_CACHE_SLOTS; i++) { head = &avc->avc_cache.slots[i]; lock = &avc->avc_cache.slots_lock[i]; spin_lock_irqsave(lock, flag); /* * With preemptable RCU, the outer spinlock does not * prevent RCU grace periods from ending. */ rcu_read_lock(); hlist_for_each_entry(node, head, list) avc_node_delete(avc, node); rcu_read_unlock(); spin_unlock_irqrestore(lock, flag); } } /** * avc_ss_reset - Flush the cache and revalidate migrated permissions. * @seqno: policy sequence number */ int avc_ss_reset(struct selinux_avc *avc, u32 seqno) { struct avc_callback_node *c; int rc = 0, tmprc; avc_flush(avc); for (c = avc_callbacks; c; c = c->next) { if (c->events & AVC_CALLBACK_RESET) { tmprc = c->callback(AVC_CALLBACK_RESET); /* save the first error encountered for the return value and continue processing the callbacks */ if (!rc) rc = tmprc; } } avc_latest_notif_update(avc, seqno, 0); return rc; } /* * Slow-path helper function for avc_has_perm_noaudit, * when the avc_node lookup fails. We get called with * the RCU read lock held, and need to return with it * still held, but drop if for the security compute. * * Don't inline this, since it's the slow-path and just * results in a bigger stack frame. */ static noinline struct avc_node *avc_compute_av(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd, struct avc_xperms_node *xp_node) { rcu_read_unlock(); INIT_LIST_HEAD(&xp_node->xpd_head); security_compute_av(state, ssid, tsid, tclass, avd, &xp_node->xp); rcu_read_lock(); return avc_insert(state->avc, ssid, tsid, tclass, avd, xp_node); } static noinline int avc_denied(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, u32 requested, u8 driver, u8 xperm, unsigned int flags, struct av_decision *avd) { if (flags & AVC_STRICT) return -EACCES; if (enforcing_enabled(state) && !(avd->flags & AVD_FLAGS_PERMISSIVE)) return -EACCES; avc_update_node(state->avc, AVC_CALLBACK_GRANT, requested, driver, xperm, ssid, tsid, tclass, avd->seqno, NULL, flags); return 0; } /* * The avc extended permissions logic adds an additional 256 bits of * permissions to an avc node when extended permissions for that node are * specified in the avtab. If the additional 256 permissions is not adequate, * as-is the case with ioctls, then multiple may be chained together and the * driver field is used to specify which set contains the permission. */ int avc_has_extended_perms(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, u32 requested, u8 driver, u8 xperm, struct common_audit_data *ad) { struct avc_node *node; struct av_decision avd; u32 denied; struct extended_perms_decision local_xpd; struct extended_perms_decision *xpd = NULL; struct extended_perms_data allowed; struct extended_perms_data auditallow; struct extended_perms_data dontaudit; struct avc_xperms_node local_xp_node; struct avc_xperms_node *xp_node; int rc = 0, rc2; xp_node = &local_xp_node; if (WARN_ON(!requested)) return -EACCES; rcu_read_lock(); node = avc_lookup(state->avc, ssid, tsid, tclass); if (unlikely(!node)) { node = avc_compute_av(state, ssid, tsid, tclass, &avd, xp_node); } else { memcpy(&avd, &node->ae.avd, sizeof(avd)); xp_node = node->ae.xp_node; } /* if extended permissions are not defined, only consider av_decision */ if (!xp_node || !xp_node->xp.len) goto decision; local_xpd.allowed = &allowed; local_xpd.auditallow = &auditallow; local_xpd.dontaudit = &dontaudit; xpd = avc_xperms_decision_lookup(driver, xp_node); if (unlikely(!xpd)) { /* * Compute the extended_perms_decision only if the driver * is flagged */ if (!security_xperm_test(xp_node->xp.drivers.p, driver)) { avd.allowed &= ~requested; goto decision; } rcu_read_unlock(); security_compute_xperms_decision(state, ssid, tsid, tclass, driver, &local_xpd); rcu_read_lock(); avc_update_node(state->avc, AVC_CALLBACK_ADD_XPERMS, requested, driver, xperm, ssid, tsid, tclass, avd.seqno, &local_xpd, 0); } else { avc_quick_copy_xperms_decision(xperm, &local_xpd, xpd); } xpd = &local_xpd; if (!avc_xperms_has_perm(xpd, xperm, XPERMS_ALLOWED)) avd.allowed &= ~requested; decision: denied = requested & ~(avd.allowed); if (unlikely(denied)) rc = avc_denied(state, ssid, tsid, tclass, requested, driver, xperm, AVC_EXTENDED_PERMS, &avd); rcu_read_unlock(); rc2 = avc_xperms_audit(state, ssid, tsid, tclass, requested, &avd, xpd, xperm, rc, ad); if (rc2) return rc2; return rc; } /** * avc_has_perm_noaudit - Check permissions but perform no auditing. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @requested: requested permissions, interpreted based on @tclass * @flags: AVC_STRICT, AVC_NONBLOCKING, or 0 * @avd: access vector decisions * * Check the AVC to determine whether the @requested permissions are granted * for the SID pair (@ssid, @tsid), interpreting the permissions * based on @tclass, and call the security server on a cache miss to obtain * a new decision and add it to the cache. Return a copy of the decisions * in @avd. Return %0 if all @requested permissions are granted, * -%EACCES if any permissions are denied, or another -errno upon * other errors. This function is typically called by avc_has_perm(), * but may also be called directly to separate permission checking from * auditing, e.g. in cases where a lock must be held for the check but * should be released for the auditing. */ inline int avc_has_perm_noaudit(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, u32 requested, unsigned int flags, struct av_decision *avd) { struct avc_node *node; struct avc_xperms_node xp_node; int rc = 0; u32 denied; if (WARN_ON(!requested)) return -EACCES; rcu_read_lock(); node = avc_lookup(state->avc, ssid, tsid, tclass); if (unlikely(!node)) node = avc_compute_av(state, ssid, tsid, tclass, avd, &xp_node); else memcpy(avd, &node->ae.avd, sizeof(*avd)); denied = requested & ~(avd->allowed); if (unlikely(denied)) rc = avc_denied(state, ssid, tsid, tclass, requested, 0, 0, flags, avd); rcu_read_unlock(); return rc; } /** * avc_has_perm - Check permissions and perform any appropriate auditing. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @requested: requested permissions, interpreted based on @tclass * @auditdata: auxiliary audit data * * Check the AVC to determine whether the @requested permissions are granted * for the SID pair (@ssid, @tsid), interpreting the permissions * based on @tclass, and call the security server on a cache miss to obtain * a new decision and add it to the cache. Audit the granting or denial of * permissions in accordance with the policy. Return %0 if all @requested * permissions are granted, -%EACCES if any permissions are denied, or * another -errno upon other errors. */ int avc_has_perm(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, u32 requested, struct common_audit_data *auditdata) { struct av_decision avd; int rc, rc2; rc = avc_has_perm_noaudit(state, ssid, tsid, tclass, requested, 0, &avd); rc2 = avc_audit(state, ssid, tsid, tclass, requested, &avd, rc, auditdata, 0); if (rc2) return rc2; return rc; } int avc_has_perm_flags(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, u32 requested, struct common_audit_data *auditdata, int flags) { struct av_decision avd; int rc, rc2; rc = avc_has_perm_noaudit(state, ssid, tsid, tclass, requested, (flags & MAY_NOT_BLOCK) ? AVC_NONBLOCKING : 0, &avd); rc2 = avc_audit(state, ssid, tsid, tclass, requested, &avd, rc, auditdata, flags); if (rc2) return rc2; return rc; } u32 avc_policy_seqno(struct selinux_state *state) { return state->avc->avc_cache.latest_notif; } void avc_disable(void) { /* * If you are looking at this because you have realized that we are * not destroying the avc_node_cachep it might be easy to fix, but * I don't know the memory barrier semantics well enough to know. It's * possible that some other task dereferenced security_ops when * it still pointed to selinux operations. If that is the case it's * possible that it is about to use the avc and is about to need the * avc_node_cachep. I know I could wrap the security.c security_ops call * in an rcu_lock, but seriously, it's not worth it. Instead I just flush * the cache and get that memory back. */ if (avc_node_cachep) { avc_flush(selinux_state.avc); /* kmem_cache_destroy(avc_node_cachep); */ } }
1 1 1 2 2 2 2 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 // SPDX-License-Identifier: GPL-2.0-only /* * NSA Security-Enhanced Linux (SELinux) security module * * This file contains the SELinux hook function implementations. * * Authors: Stephen Smalley, <sds@tycho.nsa.gov> * Chris Vance, <cvance@nai.com> * Wayne Salamon, <wsalamon@nai.com> * James Morris <jmorris@redhat.com> * * Copyright (C) 2001,2002 Networks Associates Technology, Inc. * Copyright (C) 2003-2008 Red Hat, Inc., James Morris <jmorris@redhat.com> * Eric Paris <eparis@redhat.com> * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. * <dgoeddel@trustedcs.com> * Copyright (C) 2006, 2007, 2009 Hewlett-Packard Development Company, L.P. * Paul Moore <paul@paul-moore.com> * Copyright (C) 2007 Hitachi Software Engineering Co., Ltd. * Yuichi Nakamura <ynakam@hitachisoft.jp> * Copyright (C) 2016 Mellanox Technologies */ #include <linux/init.h> #include <linux/kd.h> #include <linux/kernel.h> #include <linux/kernel_read_file.h> #include <linux/tracehook.h> #include <linux/errno.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/lsm_hooks.h> #include <linux/xattr.h> #include <linux/capability.h> #include <linux/unistd.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/proc_fs.h> #include <linux/swap.h> #include <linux/spinlock.h> #include <linux/syscalls.h> #include <linux/dcache.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/tty.h> #include <net/icmp.h> #include <net/ip.h> /* for local_port_range[] */ #include <net/tcp.h> /* struct or_callable used in sock_rcv_skb */ #include <net/inet_connection_sock.h> #include <net/net_namespace.h> #include <net/netlabel.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/interrupt.h> #include <linux/netdevice.h> /* for network interface checks */ #include <net/netlink.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/dccp.h> #include <linux/sctp.h> #include <net/sctp/structs.h> #include <linux/quota.h> #include <linux/un.h> /* for Unix socket types */ #include <net/af_unix.h> /* for Unix socket types */ #include <linux/parser.h> #include <linux/nfs_mount.h> #include <net/ipv6.h> #include <linux/hugetlb.h> #include <linux/personality.h> #include <linux/audit.h> #include <linux/string.h> #include <linux/mutex.h> #include <linux/posix-timers.h> #include <linux/syslog.h> #include <linux/user_namespace.h> #include <linux/export.h> #include <linux/msg.h> #include <linux/shm.h> #include <linux/bpf.h> #include <linux/kernfs.h> #include <linux/stringhash.h> /* for hashlen_string() */ #include <uapi/linux/mount.h> #include <linux/fsnotify.h> #include <linux/fanotify.h> #include "avc.h" #include "objsec.h" #include "netif.h" #include "netnode.h" #include "netport.h" #include "ibpkey.h" #include "xfrm.h" #include "netlabel.h" #include "audit.h" #include "avc_ss.h" struct selinux_state selinux_state; /* SECMARK reference count */ static atomic_t selinux_secmark_refcount = ATOMIC_INIT(0); #ifdef CONFIG_SECURITY_SELINUX_DEVELOP static int selinux_enforcing_boot __initdata; static int __init enforcing_setup(char *str) { unsigned long enforcing; if (!kstrtoul(str, 0, &enforcing)) selinux_enforcing_boot = enforcing ? 1 : 0; return 1; } __setup("enforcing=", enforcing_setup); #else #define selinux_enforcing_boot 1 #endif int selinux_enabled_boot __initdata = 1; #ifdef CONFIG_SECURITY_SELINUX_BOOTPARAM static int __init selinux_enabled_setup(char *str) { unsigned long enabled; if (!kstrtoul(str, 0, &enabled)) selinux_enabled_boot = enabled ? 1 : 0; return 1; } __setup("selinux=", selinux_enabled_setup); #endif static unsigned int selinux_checkreqprot_boot = CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE; static int __init checkreqprot_setup(char *str) { unsigned long checkreqprot; if (!kstrtoul(str, 0, &checkreqprot)) { selinux_checkreqprot_boot = checkreqprot ? 1 : 0; if (checkreqprot) pr_warn("SELinux: checkreqprot set to 1 via kernel parameter. This is deprecated and will be rejected in a future kernel release.\n"); } return 1; } __setup("checkreqprot=", checkreqprot_setup); /** * selinux_secmark_enabled - Check to see if SECMARK is currently enabled * * Description: * This function checks the SECMARK reference counter to see if any SECMARK * targets are currently configured, if the reference counter is greater than * zero SECMARK is considered to be enabled. Returns true (1) if SECMARK is * enabled, false (0) if SECMARK is disabled. If the always_check_network * policy capability is enabled, SECMARK is always considered enabled. * */ static int selinux_secmark_enabled(void) { return (selinux_policycap_alwaysnetwork() || atomic_read(&selinux_secmark_refcount)); } /** * selinux_peerlbl_enabled - Check to see if peer labeling is currently enabled * * Description: * This function checks if NetLabel or labeled IPSEC is enabled. Returns true * (1) if any are enabled or false (0) if neither are enabled. If the * always_check_network policy capability is enabled, peer labeling * is always considered enabled. * */ static int selinux_peerlbl_enabled(void) { return (selinux_policycap_alwaysnetwork() || netlbl_enabled() || selinux_xfrm_enabled()); } static int selinux_netcache_avc_callback(u32 event) { if (event == AVC_CALLBACK_RESET) { sel_netif_flush(); sel_netnode_flush(); sel_netport_flush(); synchronize_net(); } return 0; } static int selinux_lsm_notifier_avc_callback(u32 event) { if (event == AVC_CALLBACK_RESET) { sel_ib_pkey_flush(); call_blocking_lsm_notifier(LSM_POLICY_CHANGE, NULL); } return 0; } /* * initialise the security for the init task */ static void cred_init_security(void) { struct cred *cred = (struct cred *) current->real_cred; struct task_security_struct *tsec; tsec = selinux_cred(cred); tsec->osid = tsec->sid = SECINITSID_KERNEL; } /* * get the security ID of a set of credentials */ static inline u32 cred_sid(const struct cred *cred) { const struct task_security_struct *tsec; tsec = selinux_cred(cred); return tsec->sid; } /* * get the objective security ID of a task */ static inline u32 task_sid(const struct task_struct *task) { u32 sid; rcu_read_lock(); sid = cred_sid(__task_cred(task)); rcu_read_unlock(); return sid; } static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dentry); /* * Try reloading inode security labels that have been marked as invalid. The * @may_sleep parameter indicates when sleeping and thus reloading labels is * allowed; when set to false, returns -ECHILD when the label is * invalid. The @dentry parameter should be set to a dentry of the inode. */ static int __inode_security_revalidate(struct inode *inode, struct dentry *dentry, bool may_sleep) { struct inode_security_struct *isec = selinux_inode(inode); might_sleep_if(may_sleep); if (selinux_initialized(&selinux_state) && isec->initialized != LABEL_INITIALIZED) { if (!may_sleep) return -ECHILD; /* * Try reloading the inode security label. This will fail if * @opt_dentry is NULL and no dentry for this inode can be * found; in that case, continue using the old label. */ inode_doinit_with_dentry(inode, dentry); } return 0; } static struct inode_security_struct *inode_security_novalidate(struct inode *inode) { return selinux_inode(inode); } static struct inode_security_struct *inode_security_rcu(struct inode *inode, bool rcu) { int error; error = __inode_security_revalidate(inode, NULL, !rcu); if (error) return ERR_PTR(error); return selinux_inode(inode); } /* * Get the security label of an inode. */ static struct inode_security_struct *inode_security(struct inode *inode) { __inode_security_revalidate(inode, NULL, true); return selinux_inode(inode); } static struct inode_security_struct *backing_inode_security_novalidate(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); return selinux_inode(inode); } /* * Get the security label of a dentry's backing inode. */ static struct inode_security_struct *backing_inode_security(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); __inode_security_revalidate(inode, dentry, true); return selinux_inode(inode); } static void inode_free_security(struct inode *inode) { struct inode_security_struct *isec = selinux_inode(inode); struct superblock_security_struct *sbsec; if (!isec) return; sbsec = inode->i_sb->s_security; /* * As not all inode security structures are in a list, we check for * empty list outside of the lock to make sure that we won't waste * time taking a lock doing nothing. * * The list_del_init() function can be safely called more than once. * It should not be possible for this function to be called with * concurrent list_add(), but for better safety against future changes * in the code, we use list_empty_careful() here. */ if (!list_empty_careful(&isec->list)) { spin_lock(&sbsec->isec_lock); list_del_init(&isec->list); spin_unlock(&sbsec->isec_lock); } } static void superblock_free_security(struct super_block *sb) { struct superblock_security_struct *sbsec = sb->s_security; sb->s_security = NULL; kfree(sbsec); } struct selinux_mnt_opts { const char *fscontext, *context, *rootcontext, *defcontext; }; static void selinux_free_mnt_opts(void *mnt_opts) { struct selinux_mnt_opts *opts = mnt_opts; kfree(opts->fscontext); kfree(opts->context); kfree(opts->rootcontext); kfree(opts->defcontext); kfree(opts); } enum { Opt_error = -1, Opt_context = 0, Opt_defcontext = 1, Opt_fscontext = 2, Opt_rootcontext = 3, Opt_seclabel = 4, }; #define A(s, has_arg) {#s, sizeof(#s) - 1, Opt_##s, has_arg} static struct { const char *name; int len; int opt; bool has_arg; } tokens[] = { A(context, true), A(fscontext, true), A(defcontext, true), A(rootcontext, true), A(seclabel, false), }; #undef A static int match_opt_prefix(char *s, int l, char **arg) { int i; for (i = 0; i < ARRAY_SIZE(tokens); i++) { size_t len = tokens[i].len; if (len > l || memcmp(s, tokens[i].name, len)) continue; if (tokens[i].has_arg) { if (len == l || s[len] != '=') continue; *arg = s + len + 1; } else if (len != l) continue; return tokens[i].opt; } return Opt_error; } #define SEL_MOUNT_FAIL_MSG "SELinux: duplicate or incompatible mount options\n" static int may_context_mount_sb_relabel(u32 sid, struct superblock_security_struct *sbsec, const struct cred *cred) { const struct task_security_struct *tsec = selinux_cred(cred); int rc; rc = avc_has_perm(&selinux_state, tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__RELABELFROM, NULL); if (rc) return rc; rc = avc_has_perm(&selinux_state, tsec->sid, sid, SECCLASS_FILESYSTEM, FILESYSTEM__RELABELTO, NULL); return rc; } static int may_context_mount_inode_relabel(u32 sid, struct superblock_security_struct *sbsec, const struct cred *cred) { const struct task_security_struct *tsec = selinux_cred(cred); int rc; rc = avc_has_perm(&selinux_state, tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__RELABELFROM, NULL); if (rc) return rc; rc = avc_has_perm(&selinux_state, sid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__ASSOCIATE, NULL); return rc; } static int selinux_is_genfs_special_handling(struct super_block *sb) { /* Special handling. Genfs but also in-core setxattr handler */ return !strcmp(sb->s_type->name, "sysfs") || !strcmp(sb->s_type->name, "pstore") || !strcmp(sb->s_type->name, "debugfs") || !strcmp(sb->s_type->name, "tracefs") || !strcmp(sb->s_type->name, "rootfs") || (selinux_policycap_cgroupseclabel() && (!strcmp(sb->s_type->name, "cgroup") || !strcmp(sb->s_type->name, "cgroup2"))); } static int selinux_is_sblabel_mnt(struct super_block *sb) { struct superblock_security_struct *sbsec = sb->s_security; /* * IMPORTANT: Double-check logic in this function when adding a new * SECURITY_FS_USE_* definition! */ BUILD_BUG_ON(SECURITY_FS_USE_MAX != 7); switch (sbsec->behavior) { case SECURITY_FS_USE_XATTR: case SECURITY_FS_USE_TRANS: case SECURITY_FS_USE_TASK: case SECURITY_FS_USE_NATIVE: return 1; case SECURITY_FS_USE_GENFS: return selinux_is_genfs_special_handling(sb); /* Never allow relabeling on context mounts */ case SECURITY_FS_USE_MNTPOINT: case SECURITY_FS_USE_NONE: default: return 0; } } static int sb_finish_set_opts(struct super_block *sb) { struct superblock_security_struct *sbsec = sb->s_security; struct dentry *root = sb->s_root; struct inode *root_inode = d_backing_inode(root); int rc = 0; if (sbsec->behavior == SECURITY_FS_USE_XATTR) { /* Make sure that the xattr handler exists and that no error other than -ENODATA is returned by getxattr on the root directory. -ENODATA is ok, as this may be the first boot of the SELinux kernel before we have assigned xattr values to the filesystem. */ if (!(root_inode->i_opflags & IOP_XATTR)) { pr_warn("SELinux: (dev %s, type %s) has no " "xattr support\n", sb->s_id, sb->s_type->name); rc = -EOPNOTSUPP; goto out; } rc = __vfs_getxattr(root, root_inode, XATTR_NAME_SELINUX, NULL, 0); if (rc < 0 && rc != -ENODATA) { if (rc == -EOPNOTSUPP) pr_warn("SELinux: (dev %s, type " "%s) has no security xattr handler\n", sb->s_id, sb->s_type->name); else pr_warn("SELinux: (dev %s, type " "%s) getxattr errno %d\n", sb->s_id, sb->s_type->name, -rc); goto out; } } sbsec->flags |= SE_SBINITIALIZED; /* * Explicitly set or clear SBLABEL_MNT. It's not sufficient to simply * leave the flag untouched because sb_clone_mnt_opts might be handing * us a superblock that needs the flag to be cleared. */ if (selinux_is_sblabel_mnt(sb)) sbsec->flags |= SBLABEL_MNT; else sbsec->flags &= ~SBLABEL_MNT; /* Initialize the root inode. */ rc = inode_doinit_with_dentry(root_inode, root); /* Initialize any other inodes associated with the superblock, e.g. inodes created prior to initial policy load or inodes created during get_sb by a pseudo filesystem that directly populates itself. */ spin_lock(&sbsec->isec_lock); while (!list_empty(&sbsec->isec_head)) { struct inode_security_struct *isec = list_first_entry(&sbsec->isec_head, struct inode_security_struct, list); struct inode *inode = isec->inode; list_del_init(&isec->list); spin_unlock(&sbsec->isec_lock); inode = igrab(inode); if (inode) { if (!IS_PRIVATE(inode)) inode_doinit_with_dentry(inode, NULL); iput(inode); } spin_lock(&sbsec->isec_lock); } spin_unlock(&sbsec->isec_lock); out: return rc; } static int bad_option(struct superblock_security_struct *sbsec, char flag, u32 old_sid, u32 new_sid) { char mnt_flags = sbsec->flags & SE_MNTMASK; /* check if the old mount command had the same options */ if (sbsec->flags & SE_SBINITIALIZED) if (!(sbsec->flags & flag) || (old_sid != new_sid)) return 1; /* check if we were passed the same options twice, * aka someone passed context=a,context=b */ if (!(sbsec->flags & SE_SBINITIALIZED)) if (mnt_flags & flag) return 1; return 0; } static int parse_sid(struct super_block *sb, const char *s, u32 *sid) { int rc = security_context_str_to_sid(&selinux_state, s, sid, GFP_KERNEL); if (rc) pr_warn("SELinux: security_context_str_to_sid" "(%s) failed for (dev %s, type %s) errno=%d\n", s, sb->s_id, sb->s_type->name, rc); return rc; } /* * Allow filesystems with binary mount data to explicitly set mount point * labeling information. */ static int selinux_set_mnt_opts(struct super_block *sb, void *mnt_opts, unsigned long kern_flags, unsigned long *set_kern_flags) { const struct cred *cred = current_cred(); struct superblock_security_struct *sbsec = sb->s_security; struct dentry *root = sbsec->sb->s_root; struct selinux_mnt_opts *opts = mnt_opts; struct inode_security_struct *root_isec; u32 fscontext_sid = 0, context_sid = 0, rootcontext_sid = 0; u32 defcontext_sid = 0; int rc = 0; mutex_lock(&sbsec->lock); if (!selinux_initialized(&selinux_state)) { if (!opts) { /* Defer initialization until selinux_complete_init, after the initial policy is loaded and the security server is ready to handle calls. */ goto out; } rc = -EINVAL; pr_warn("SELinux: Unable to set superblock options " "before the security server is initialized\n"); goto out; } if (kern_flags && !set_kern_flags) { /* Specifying internal flags without providing a place to * place the results is not allowed */ rc = -EINVAL; goto out; } /* * Binary mount data FS will come through this function twice. Once * from an explicit call and once from the generic calls from the vfs. * Since the generic VFS calls will not contain any security mount data * we need to skip the double mount verification. * * This does open a hole in which we will not notice if the first * mount using this sb set explict options and a second mount using * this sb does not set any security options. (The first options * will be used for both mounts) */ if ((sbsec->flags & SE_SBINITIALIZED) && (sb->s_type->fs_flags & FS_BINARY_MOUNTDATA) && !opts) goto out; root_isec = backing_inode_security_novalidate(root); /* * parse the mount options, check if they are valid sids. * also check if someone is trying to mount the same sb more * than once with different security options. */ if (opts) { if (opts->fscontext) { rc = parse_sid(sb, opts->fscontext, &fscontext_sid); if (rc) goto out; if (bad_option(sbsec, FSCONTEXT_MNT, sbsec->sid, fscontext_sid)) goto out_double_mount; sbsec->flags |= FSCONTEXT_MNT; } if (opts->context) { rc = parse_sid(sb, opts->context, &context_sid); if (rc) goto out; if (bad_option(sbsec, CONTEXT_MNT, sbsec->mntpoint_sid, context_sid)) goto out_double_mount; sbsec->flags |= CONTEXT_MNT; } if (opts->rootcontext) { rc = parse_sid(sb, opts->rootcontext, &rootcontext_sid); if (rc) goto out; if (bad_option(sbsec, ROOTCONTEXT_MNT, root_isec->sid, rootcontext_sid)) goto out_double_mount; sbsec->flags |= ROOTCONTEXT_MNT; } if (opts->defcontext) { rc = parse_sid(sb, opts->defcontext, &defcontext_sid); if (rc) goto out; if (bad_option(sbsec, DEFCONTEXT_MNT, sbsec->def_sid, defcontext_sid)) goto out_double_mount; sbsec->flags |= DEFCONTEXT_MNT; } } if (sbsec->flags & SE_SBINITIALIZED) { /* previously mounted with options, but not on this attempt? */ if ((sbsec->flags & SE_MNTMASK) && !opts) goto out_double_mount; rc = 0; goto out; } if (strcmp(sb->s_type->name, "proc") == 0) sbsec->flags |= SE_SBPROC | SE_SBGENFS; if (!strcmp(sb->s_type->name, "debugfs") || !strcmp(sb->s_type->name, "tracefs") || !strcmp(sb->s_type->name, "binder") || !strcmp(sb->s_type->name, "bpf") || !strcmp(sb->s_type->name, "pstore")) sbsec->flags |= SE_SBGENFS; if (!strcmp(sb->s_type->name, "sysfs") || !strcmp(sb->s_type->name, "cgroup") || !strcmp(sb->s_type->name, "cgroup2")) sbsec->flags |= SE_SBGENFS | SE_SBGENFS_XATTR; if (!sbsec->behavior) { /* * Determine the labeling behavior to use for this * filesystem type. */ rc = security_fs_use(&selinux_state, sb); if (rc) { pr_warn("%s: security_fs_use(%s) returned %d\n", __func__, sb->s_type->name, rc); goto out; } } /* * If this is a user namespace mount and the filesystem type is not * explicitly whitelisted, then no contexts are allowed on the command * line and security labels must be ignored. */ if (sb->s_user_ns != &init_user_ns && strcmp(sb->s_type->name, "tmpfs") && strcmp(sb->s_type->name, "ramfs") && strcmp(sb->s_type->name, "devpts")) { if (context_sid || fscontext_sid || rootcontext_sid || defcontext_sid) { rc = -EACCES; goto out; } if (sbsec->behavior == SECURITY_FS_USE_XATTR) { sbsec->behavior = SECURITY_FS_USE_MNTPOINT; rc = security_transition_sid(&selinux_state, current_sid(), current_sid(), SECCLASS_FILE, NULL, &sbsec->mntpoint_sid); if (rc) goto out; } goto out_set_opts; } /* sets the context of the superblock for the fs being mounted. */ if (fscontext_sid) { rc = may_context_mount_sb_relabel(fscontext_sid, sbsec, cred); if (rc) goto out; sbsec->sid = fscontext_sid; } /* * Switch to using mount point labeling behavior. * sets the label used on all file below the mountpoint, and will set * the superblock context if not already set. */ if (kern_flags & SECURITY_LSM_NATIVE_LABELS && !context_sid) { sbsec->behavior = SECURITY_FS_USE_NATIVE; *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS; } if (context_sid) { if (!fscontext_sid) { rc = may_context_mount_sb_relabel(context_sid, sbsec, cred); if (rc) goto out; sbsec->sid = context_sid; } else { rc = may_context_mount_inode_relabel(context_sid, sbsec, cred); if (rc) goto out; } if (!rootcontext_sid) rootcontext_sid = context_sid; sbsec->mntpoint_sid = context_sid; sbsec->behavior = SECURITY_FS_USE_MNTPOINT; } if (rootcontext_sid) { rc = may_context_mount_inode_relabel(rootcontext_sid, sbsec, cred); if (rc) goto out; root_isec->sid = rootcontext_sid; root_isec->initialized = LABEL_INITIALIZED; } if (defcontext_sid) { if (sbsec->behavior != SECURITY_FS_USE_XATTR && sbsec->behavior != SECURITY_FS_USE_NATIVE) { rc = -EINVAL; pr_warn("SELinux: defcontext option is " "invalid for this filesystem type\n"); goto out; } if (defcontext_sid != sbsec->def_sid) { rc = may_context_mount_inode_relabel(defcontext_sid, sbsec, cred); if (rc) goto out; } sbsec->def_sid = defcontext_sid; } out_set_opts: rc = sb_finish_set_opts(sb); out: mutex_unlock(&sbsec->lock); return rc; out_double_mount: rc = -EINVAL; pr_warn("SELinux: mount invalid. Same superblock, different " "security settings for (dev %s, type %s)\n", sb->s_id, sb->s_type->name); goto out; } static int selinux_cmp_sb_context(const struct super_block *oldsb, const struct super_block *newsb) { struct superblock_security_struct *old = oldsb->s_security; struct superblock_security_struct *new = newsb->s_security; char oldflags = old->flags & SE_MNTMASK; char newflags = new->flags & SE_MNTMASK; if (oldflags != newflags) goto mismatch; if ((oldflags & FSCONTEXT_MNT) && old->sid != new->sid) goto mismatch; if ((oldflags & CONTEXT_MNT) && old->mntpoint_sid != new->mntpoint_sid) goto mismatch; if ((oldflags & DEFCONTEXT_MNT) && old->def_sid != new->def_sid) goto mismatch; if (oldflags & ROOTCONTEXT_MNT) { struct inode_security_struct *oldroot = backing_inode_security(oldsb->s_root); struct inode_security_struct *newroot = backing_inode_security(newsb->s_root); if (oldroot->sid != newroot->sid) goto mismatch; } return 0; mismatch: pr_warn("SELinux: mount invalid. Same superblock, " "different security settings for (dev %s, " "type %s)\n", newsb->s_id, newsb->s_type->name); return -EBUSY; } static int selinux_sb_clone_mnt_opts(const struct super_block *oldsb, struct super_block *newsb, unsigned long kern_flags, unsigned long *set_kern_flags) { int rc = 0; const struct superblock_security_struct *oldsbsec = oldsb->s_security; struct superblock_security_struct *newsbsec = newsb->s_security; int set_fscontext = (oldsbsec->flags & FSCONTEXT_MNT); int set_context = (oldsbsec->flags & CONTEXT_MNT); int set_rootcontext = (oldsbsec->flags & ROOTCONTEXT_MNT); /* * if the parent was able to be mounted it clearly had no special lsm * mount options. thus we can safely deal with this superblock later */ if (!selinux_initialized(&selinux_state)) return 0; /* * Specifying internal flags without providing a place to * place the results is not allowed. */ if (kern_flags && !set_kern_flags) return -EINVAL; /* how can we clone if the old one wasn't set up?? */ BUG_ON(!(oldsbsec->flags & SE_SBINITIALIZED)); /* if fs is reusing a sb, make sure that the contexts match */ if (newsbsec->flags & SE_SBINITIALIZED) { if ((kern_flags & SECURITY_LSM_NATIVE_LABELS) && !set_context) *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS; return selinux_cmp_sb_context(oldsb, newsb); } mutex_lock(&newsbsec->lock); newsbsec->flags = oldsbsec->flags; newsbsec->sid = oldsbsec->sid; newsbsec->def_sid = oldsbsec->def_sid; newsbsec->behavior = oldsbsec->behavior; if (newsbsec->behavior == SECURITY_FS_USE_NATIVE && !(kern_flags & SECURITY_LSM_NATIVE_LABELS) && !set_context) { rc = security_fs_use(&selinux_state, newsb); if (rc) goto out; } if (kern_flags & SECURITY_LSM_NATIVE_LABELS && !set_context) { newsbsec->behavior = SECURITY_FS_USE_NATIVE; *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS; } if (set_context) { u32 sid = oldsbsec->mntpoint_sid; if (!set_fscontext) newsbsec->sid = sid; if (!set_rootcontext) { struct inode_security_struct *newisec = backing_inode_security(newsb->s_root); newisec->sid = sid; } newsbsec->mntpoint_sid = sid; } if (set_rootcontext) { const struct inode_security_struct *oldisec = backing_inode_security(oldsb->s_root); struct inode_security_struct *newisec = backing_inode_security(newsb->s_root); newisec->sid = oldisec->sid; } sb_finish_set_opts(newsb); out: mutex_unlock(&newsbsec->lock); return rc; } static int selinux_add_opt(int token, const char *s, void **mnt_opts) { struct selinux_mnt_opts *opts = *mnt_opts; bool is_alloc_opts = false; if (token == Opt_seclabel) /* eaten and completely ignored */ return 0; if (!s) return -ENOMEM; if (!opts) { opts = kzalloc(sizeof(struct selinux_mnt_opts), GFP_KERNEL); if (!opts) return -ENOMEM; *mnt_opts = opts; is_alloc_opts = true; } switch (token) { case Opt_context: if (opts->context || opts->defcontext) goto Einval; opts->context = s; break; case Opt_fscontext: if (opts->fscontext) goto Einval; opts->fscontext = s; break; case Opt_rootcontext: if (opts->rootcontext) goto Einval; opts->rootcontext = s; break; case Opt_defcontext: if (opts->context || opts->defcontext) goto Einval; opts->defcontext = s; break; } return 0; Einval: if (is_alloc_opts) { kfree(opts); *mnt_opts = NULL; } pr_warn(SEL_MOUNT_FAIL_MSG); return -EINVAL; } static int selinux_add_mnt_opt(const char *option, const char *val, int len, void **mnt_opts) { int token = Opt_error; int rc, i; for (i = 0; i < ARRAY_SIZE(tokens); i++) { if (strcmp(option, tokens[i].name) == 0) { token = tokens[i].opt; break; } } if (token == Opt_error) return -EINVAL; if (token != Opt_seclabel) { val = kmemdup_nul(val, len, GFP_KERNEL); if (!val) { rc = -ENOMEM; goto free_opt; } } rc = selinux_add_opt(token, val, mnt_opts); if (unlikely(rc)) { kfree(val); goto free_opt; } return rc; free_opt: if (*mnt_opts) { selinux_free_mnt_opts(*mnt_opts); *mnt_opts = NULL; } return rc; } static int show_sid(struct seq_file *m, u32 sid) { char *context = NULL; u32 len; int rc; rc = security_sid_to_context(&selinux_state, sid, &context, &len); if (!rc) { bool has_comma = context && strchr(context, ','); seq_putc(m, '='); if (has_comma) seq_putc(m, '\"'); seq_escape(m, context, "\"\n\\"); if (has_comma) seq_putc(m, '\"'); } kfree(context); return rc; } static int selinux_sb_show_options(struct seq_file *m, struct super_block *sb) { struct superblock_security_struct *sbsec = sb->s_security; int rc; if (!(sbsec->flags & SE_SBINITIALIZED)) return 0; if (!selinux_initialized(&selinux_state)) return 0; if (sbsec->flags & FSCONTEXT_MNT) { seq_putc(m, ','); seq_puts(m, FSCONTEXT_STR); rc = show_sid(m, sbsec->sid); if (rc) return rc; } if (sbsec->flags & CONTEXT_MNT) { seq_putc(m, ','); seq_puts(m, CONTEXT_STR); rc = show_sid(m, sbsec->mntpoint_sid); if (rc) return rc; } if (sbsec->flags & DEFCONTEXT_MNT) { seq_putc(m, ','); seq_puts(m, DEFCONTEXT_STR); rc = show_sid(m, sbsec->def_sid); if (rc) return rc; } if (sbsec->flags & ROOTCONTEXT_MNT) { struct dentry *root = sbsec->sb->s_root; struct inode_security_struct *isec = backing_inode_security(root); seq_putc(m, ','); seq_puts(m, ROOTCONTEXT_STR); rc = show_sid(m, isec->sid); if (rc) return rc; } if (sbsec->flags & SBLABEL_MNT) { seq_putc(m, ','); seq_puts(m, SECLABEL_STR); } return 0; } static inline u16 inode_mode_to_security_class(umode_t mode) { switch (mode & S_IFMT) { case S_IFSOCK: return SECCLASS_SOCK_FILE; case S_IFLNK: return SECCLASS_LNK_FILE; case S_IFREG: return SECCLASS_FILE; case S_IFBLK: return SECCLASS_BLK_FILE; case S_IFDIR: return SECCLASS_DIR; case S_IFCHR: return SECCLASS_CHR_FILE; case S_IFIFO: return SECCLASS_FIFO_FILE; } return SECCLASS_FILE; } static inline int default_protocol_stream(int protocol) { return (protocol == IPPROTO_IP || protocol == IPPROTO_TCP); } static inline int default_protocol_dgram(int protocol) { return (protocol == IPPROTO_IP || protocol == IPPROTO_UDP); } static inline u16 socket_type_to_security_class(int family, int type, int protocol) { int extsockclass = selinux_policycap_extsockclass(); switch (family) { case PF_UNIX: switch (type) { case SOCK_STREAM: case SOCK_SEQPACKET: return SECCLASS_UNIX_STREAM_SOCKET; case SOCK_DGRAM: case SOCK_RAW: return SECCLASS_UNIX_DGRAM_SOCKET; } break; case PF_INET: case PF_INET6: switch (type) { case SOCK_STREAM: case SOCK_SEQPACKET: if (default_protocol_stream(protocol)) return SECCLASS_TCP_SOCKET; else if (extsockclass && protocol == IPPROTO_SCTP) return SECCLASS_SCTP_SOCKET; else return SECCLASS_RAWIP_SOCKET; case SOCK_DGRAM: if (default_protocol_dgram(protocol)) return SECCLASS_UDP_SOCKET; else if (extsockclass && (protocol == IPPROTO_ICMP || protocol == IPPROTO_ICMPV6)) return SECCLASS_ICMP_SOCKET; else return SECCLASS_RAWIP_SOCKET; case SOCK_DCCP: return SECCLASS_DCCP_SOCKET; default: return SECCLASS_RAWIP_SOCKET; } break; case PF_NETLINK: switch (protocol) { case NETLINK_ROUTE: return SECCLASS_NETLINK_ROUTE_SOCKET; case NETLINK_SOCK_DIAG: return SECCLASS_NETLINK_TCPDIAG_SOCKET; case NETLINK_NFLOG: return SECCLASS_NETLINK_NFLOG_SOCKET; case NETLINK_XFRM: return SECCLASS_NETLINK_XFRM_SOCKET; case NETLINK_SELINUX: return SECCLASS_NETLINK_SELINUX_SOCKET; case NETLINK_ISCSI: return SECCLASS_NETLINK_ISCSI_SOCKET; case NETLINK_AUDIT: return SECCLASS_NETLINK_AUDIT_SOCKET; case NETLINK_FIB_LOOKUP: return SECCLASS_NETLINK_FIB_LOOKUP_SOCKET; case NETLINK_CONNECTOR: return SECCLASS_NETLINK_CONNECTOR_SOCKET; case NETLINK_NETFILTER: return SECCLASS_NETLINK_NETFILTER_SOCKET; case NETLINK_DNRTMSG: return SECCLASS_NETLINK_DNRT_SOCKET; case NETLINK_KOBJECT_UEVENT: return SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET; case NETLINK_GENERIC: return SECCLASS_NETLINK_GENERIC_SOCKET; case NETLINK_SCSITRANSPORT: return SECCLASS_NETLINK_SCSITRANSPORT_SOCKET; case NETLINK_RDMA: return SECCLASS_NETLINK_RDMA_SOCKET; case NETLINK_CRYPTO: return SECCLASS_NETLINK_CRYPTO_SOCKET; default: return SECCLASS_NETLINK_SOCKET; } case PF_PACKET: return SECCLASS_PACKET_SOCKET; case PF_KEY: return SECCLASS_KEY_SOCKET; case PF_APPLETALK: return SECCLASS_APPLETALK_SOCKET; } if (extsockclass) { switch (family) { case PF_AX25: return SECCLASS_AX25_SOCKET; case PF_IPX: return SECCLASS_IPX_SOCKET; case PF_NETROM: return SECCLASS_NETROM_SOCKET; case PF_ATMPVC: return SECCLASS_ATMPVC_SOCKET; case PF_X25: return SECCLASS_X25_SOCKET; case PF_ROSE: return SECCLASS_ROSE_SOCKET; case PF_DECnet: return SECCLASS_DECNET_SOCKET; case PF_ATMSVC: return SECCLASS_ATMSVC_SOCKET; case PF_RDS: return SECCLASS_RDS_SOCKET; case PF_IRDA: return SECCLASS_IRDA_SOCKET; case PF_PPPOX: return SECCLASS_PPPOX_SOCKET; case PF_LLC: return SECCLASS_LLC_SOCKET; case PF_CAN: return SECCLASS_CAN_SOCKET; case PF_TIPC: return SECCLASS_TIPC_SOCKET; case PF_BLUETOOTH: return SECCLASS_BLUETOOTH_SOCKET; case PF_IUCV: return SECCLASS_IUCV_SOCKET; case PF_RXRPC: return SECCLASS_RXRPC_SOCKET; case PF_ISDN: return SECCLASS_ISDN_SOCKET; case PF_PHONET: return SECCLASS_PHONET_SOCKET; case PF_IEEE802154: return SECCLASS_IEEE802154_SOCKET; case PF_CAIF: return SECCLASS_CAIF_SOCKET; case PF_ALG: return SECCLASS_ALG_SOCKET; case PF_NFC: return SECCLASS_NFC_SOCKET; case PF_VSOCK: return SECCLASS_VSOCK_SOCKET; case PF_KCM: return SECCLASS_KCM_SOCKET; case PF_QIPCRTR: return SECCLASS_QIPCRTR_SOCKET; case PF_SMC: return SECCLASS_SMC_SOCKET; case PF_XDP: return SECCLASS_XDP_SOCKET; #if PF_MAX > 45 #error New address family defined, please update this function. #endif } } return SECCLASS_SOCKET; } static int selinux_genfs_get_sid(struct dentry *dentry, u16 tclass, u16 flags, u32 *sid) { int rc; struct super_block *sb = dentry->d_sb; char *buffer, *path; buffer = (char *)__get_free_page(GFP_KERNEL); if (!buffer) return -ENOMEM; path = dentry_path_raw(dentry, buffer, PAGE_SIZE); if (IS_ERR(path)) rc = PTR_ERR(path); else { if (flags & SE_SBPROC) { /* each process gets a /proc/PID/ entry. Strip off the * PID part to get a valid selinux labeling. * e.g. /proc/1/net/rpc/nfs -> /net/rpc/nfs */ while (path[1] >= '0' && path[1] <= '9') { path[1] = '/'; path++; } } rc = security_genfs_sid(&selinux_state, sb->s_type->name, path, tclass, sid); if (rc == -ENOENT) { /* No match in policy, mark as unlabeled. */ *sid = SECINITSID_UNLABELED; rc = 0; } } free_page((unsigned long)buffer); return rc; } static int inode_doinit_use_xattr(struct inode *inode, struct dentry *dentry, u32 def_sid, u32 *sid) { #define INITCONTEXTLEN 255 char *context; unsigned int len; int rc; len = INITCONTEXTLEN; context = kmalloc(len + 1, GFP_NOFS); if (!context) return -ENOMEM; context[len] = '\0'; rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX, context, len); if (rc == -ERANGE) { kfree(context); /* Need a larger buffer. Query for the right size. */ rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX, NULL, 0); if (rc < 0) return rc; len = rc; context = kmalloc(len + 1, GFP_NOFS); if (!context) return -ENOMEM; context[len] = '\0'; rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX, context, len); } if (rc < 0) { kfree(context); if (rc != -ENODATA) { pr_warn("SELinux: %s: getxattr returned %d for dev=%s ino=%ld\n", __func__, -rc, inode->i_sb->s_id, inode->i_ino); return rc; } *sid = def_sid; return 0; } rc = security_context_to_sid_default(&selinux_state, context, rc, sid, def_sid, GFP_NOFS); if (rc) { char *dev = inode->i_sb->s_id; unsigned long ino = inode->i_ino; if (rc == -EINVAL) { pr_notice_ratelimited("SELinux: inode=%lu on dev=%s was found to have an invalid context=%s. This indicates you may need to relabel the inode or the filesystem in question.\n", ino, dev, context); } else { pr_warn("SELinux: %s: context_to_sid(%s) returned %d for dev=%s ino=%ld\n", __func__, context, -rc, dev, ino); } } kfree(context); return 0; } /* The inode's security attributes must be initialized before first use. */ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dentry) { struct superblock_security_struct *sbsec = NULL; struct inode_security_struct *isec = selinux_inode(inode); u32 task_sid, sid = 0; u16 sclass; struct dentry *dentry; int rc = 0; if (isec->initialized == LABEL_INITIALIZED) return 0; spin_lock(&isec->lock); if (isec->initialized == LABEL_INITIALIZED) goto out_unlock; if (isec->sclass == SECCLASS_FILE) isec->sclass = inode_mode_to_security_class(inode->i_mode); sbsec = inode->i_sb->s_security; if (!(sbsec->flags & SE_SBINITIALIZED)) { /* Defer initialization until selinux_complete_init, after the initial policy is loaded and the security server is ready to handle calls. */ spin_lock(&sbsec->isec_lock); if (list_empty(&isec->list)) list_add(&isec->list, &sbsec->isec_head); spin_unlock(&sbsec->isec_lock); goto out_unlock; } sclass = isec->sclass; task_sid = isec->task_sid; sid = isec->sid; isec->initialized = LABEL_PENDING; spin_unlock(&isec->lock); switch (sbsec->behavior) { case SECURITY_FS_USE_NATIVE: break; case SECURITY_FS_USE_XATTR: if (!(inode->i_opflags & IOP_XATTR)) { sid = sbsec->def_sid; break; } /* Need a dentry, since the xattr API requires one. Life would be simpler if we could just pass the inode. */ if (opt_dentry) { /* Called from d_instantiate or d_splice_alias. */ dentry = dget(opt_dentry); } else { /* * Called from selinux_complete_init, try to find a dentry. * Some filesystems really want a connected one, so try * that first. We could split SECURITY_FS_USE_XATTR in * two, depending upon that... */ dentry = d_find_alias(inode); if (!dentry) dentry = d_find_any_alias(inode); } if (!dentry) { /* * this is can be hit on boot when a file is accessed * before the policy is loaded. When we load policy we * may find inodes that have no dentry on the * sbsec->isec_head list. No reason to complain as these * will get fixed up the next time we go through * inode_doinit with a dentry, before these inodes could * be used again by userspace. */ goto out_invalid; } rc = inode_doinit_use_xattr(inode, dentry, sbsec->def_sid, &sid); dput(dentry); if (rc) goto out; break; case SECURITY_FS_USE_TASK: sid = task_sid; break; case SECURITY_FS_USE_TRANS: /* Default to the fs SID. */ sid = sbsec->sid; /* Try to obtain a transition SID. */ rc = security_transition_sid(&selinux_state, task_sid, sid, sclass, NULL, &sid); if (rc) goto out; break; case SECURITY_FS_USE_MNTPOINT: sid = sbsec->mntpoint_sid; break; default: /* Default to the fs superblock SID. */ sid = sbsec->sid; if ((sbsec->flags & SE_SBGENFS) && (!S_ISLNK(inode->i_mode) || selinux_policycap_genfs_seclabel_symlinks())) { /* We must have a dentry to determine the label on * procfs inodes */ if (opt_dentry) { /* Called from d_instantiate or * d_splice_alias. */ dentry = dget(opt_dentry); } else { /* Called from selinux_complete_init, try to * find a dentry. Some filesystems really want * a connected one, so try that first. */ dentry = d_find_alias(inode); if (!dentry) dentry = d_find_any_alias(inode); } /* * This can be hit on boot when a file is accessed * before the policy is loaded. When we load policy we * may find inodes that have no dentry on the * sbsec->isec_head list. No reason to complain as * these will get fixed up the next time we go through * inode_doinit() with a dentry, before these inodes * could be used again by userspace. */ if (!dentry) goto out_invalid; rc = selinux_genfs_get_sid(dentry, sclass, sbsec->flags, &sid); if (rc) { dput(dentry); goto out; } if ((sbsec->flags & SE_SBGENFS_XATTR) && (inode->i_opflags & IOP_XATTR)) { rc = inode_doinit_use_xattr(inode, dentry, sid, &sid); if (rc) { dput(dentry); goto out; } } dput(dentry); } break; } out: spin_lock(&isec->lock); if (isec->initialized == LABEL_PENDING) { if (rc) { isec->initialized = LABEL_INVALID; goto out_unlock; } isec->initialized = LABEL_INITIALIZED; isec->sid = sid; } out_unlock: spin_unlock(&isec->lock); return rc; out_invalid: spin_lock(&isec->lock); if (isec->initialized == LABEL_PENDING) { isec->initialized = LABEL_INVALID; isec->sid = sid; } spin_unlock(&isec->lock); return 0; } /* Convert a Linux signal to an access vector. */ static inline u32 signal_to_av(int sig) { u32 perm = 0; switch (sig) { case SIGCHLD: /* Commonly granted from child to parent. */ perm = PROCESS__SIGCHLD; break; case SIGKILL: /* Cannot be caught or ignored */ perm = PROCESS__SIGKILL; break; case SIGSTOP: /* Cannot be caught or ignored */ perm = PROCESS__SIGSTOP; break; default: /* All other signals. */ perm = PROCESS__SIGNAL; break; } return perm; } #if CAP_LAST_CAP > 63 #error Fix SELinux to handle capabilities > 63. #endif /* Check whether a task is allowed to use a capability. */ static int cred_has_capability(const struct cred *cred, int cap, unsigned int opts, bool initns) { struct common_audit_data ad; struct av_decision avd; u16 sclass; u32 sid = cred_sid(cred); u32 av = CAP_TO_MASK(cap); int rc; ad.type = LSM_AUDIT_DATA_CAP; ad.u.cap = cap; switch (CAP_TO_INDEX(cap)) { case 0: sclass = initns ? SECCLASS_CAPABILITY : SECCLASS_CAP_USERNS; break; case 1: sclass = initns ? SECCLASS_CAPABILITY2 : SECCLASS_CAP2_USERNS; break; default: pr_err("SELinux: out of range capability %d\n", cap); BUG(); return -EINVAL; } rc = avc_has_perm_noaudit(&selinux_state, sid, sid, sclass, av, 0, &avd); if (!(opts & CAP_OPT_NOAUDIT)) { int rc2 = avc_audit(&selinux_state, sid, sid, sclass, av, &avd, rc, &ad, 0); if (rc2) return rc2; } return rc; } /* Check whether a task has a particular permission to an inode. The 'adp' parameter is optional and allows other audit data to be passed (e.g. the dentry). */ static int inode_has_perm(const struct cred *cred, struct inode *inode, u32 perms, struct common_audit_data *adp) { struct inode_security_struct *isec; u32 sid; validate_creds(cred); if (unlikely(IS_PRIVATE(inode))) return 0; sid = cred_sid(cred); isec = selinux_inode(inode); return avc_has_perm(&selinux_state, sid, isec->sid, isec->sclass, perms, adp); } /* Same as inode_has_perm, but pass explicit audit data containing the dentry to help the auditing code to more easily generate the pathname if needed. */ static inline int dentry_has_perm(const struct cred *cred, struct dentry *dentry, u32 av) { struct inode *inode = d_backing_inode(dentry); struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; __inode_security_revalidate(inode, dentry, true); return inode_has_perm(cred, inode, av, &ad); } /* Same as inode_has_perm, but pass explicit audit data containing the path to help the auditing code to more easily generate the pathname if needed. */ static inline int path_has_perm(const struct cred *cred, const struct path *path, u32 av) { struct inode *inode = d_backing_inode(path->dentry); struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_PATH; ad.u.path = *path; __inode_security_revalidate(inode, path->dentry, true); return inode_has_perm(cred, inode, av, &ad); } /* Same as path_has_perm, but uses the inode from the file struct. */ static inline int file_path_has_perm(const struct cred *cred, struct file *file, u32 av) { struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = file; return inode_has_perm(cred, file_inode(file), av, &ad); } #ifdef CONFIG_BPF_SYSCALL static int bpf_fd_pass(struct file *file, u32 sid); #endif /* Check whether a task can use an open file descriptor to access an inode in a given way. Check access to the descriptor itself, and then use dentry_has_perm to check a particular permission to the file. Access to the descriptor is implicitly granted if it has the same SID as the process. If av is zero, then access to the file is not checked, e.g. for cases where only the descriptor is affected like seek. */ static int file_has_perm(const struct cred *cred, struct file *file, u32 av) { struct file_security_struct *fsec = selinux_file(file); struct inode *inode = file_inode(file); struct common_audit_data ad; u32 sid = cred_sid(cred); int rc; ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = file; if (sid != fsec->sid) { rc = avc_has_perm(&selinux_state, sid, fsec->sid, SECCLASS_FD, FD__USE, &ad); if (rc) goto out; } #ifdef CONFIG_BPF_SYSCALL rc = bpf_fd_pass(file, cred_sid(cred)); if (rc) return rc; #endif /* av is zero if only checking access to the descriptor. */ rc = 0; if (av) rc = inode_has_perm(cred, inode, av, &ad); out: return rc; } /* * Determine the label for an inode that might be unioned. */ static int selinux_determine_inode_label(const struct task_security_struct *tsec, struct inode *dir, const struct qstr *name, u16 tclass, u32 *_new_isid) { const struct superblock_security_struct *sbsec = dir->i_sb->s_security; if ((sbsec->flags & SE_SBINITIALIZED) && (sbsec->behavior == SECURITY_FS_USE_MNTPOINT)) { *_new_isid = sbsec->mntpoint_sid; } else if ((sbsec->flags & SBLABEL_MNT) && tsec->create_sid) { *_new_isid = tsec->create_sid; } else { const struct inode_security_struct *dsec = inode_security(dir); return security_transition_sid(&selinux_state, tsec->sid, dsec->sid, tclass, name, _new_isid); } return 0; } /* Check whether a task can create a file. */ static int may_create(struct inode *dir, struct dentry *dentry, u16 tclass) { const struct task_security_struct *tsec = selinux_cred(current_cred()); struct inode_security_struct *dsec; struct superblock_security_struct *sbsec; u32 sid, newsid; struct common_audit_data ad; int rc; dsec = inode_security(dir); sbsec = dir->i_sb->s_security; sid = tsec->sid; ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; rc = avc_has_perm(&selinux_state, sid, dsec->sid, SECCLASS_DIR, DIR__ADD_NAME | DIR__SEARCH, &ad); if (rc) return rc; rc = selinux_determine_inode_label(tsec, dir, &dentry->d_name, tclass, &newsid); if (rc) return rc; rc = avc_has_perm(&selinux_state, sid, newsid, tclass, FILE__CREATE, &ad); if (rc) return rc; return avc_has_perm(&selinux_state, newsid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__ASSOCIATE, &ad); } #define MAY_LINK 0 #define MAY_UNLINK 1 #define MAY_RMDIR 2 /* Check whether a task can link, unlink, or rmdir a file/directory. */ static int may_link(struct inode *dir, struct dentry *dentry, int kind) { struct inode_security_struct *dsec, *isec; struct common_audit_data ad; u32 sid = current_sid(); u32 av; int rc; dsec = inode_security(dir); isec = backing_inode_security(dentry); ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; av = DIR__SEARCH; av |= (kind ? DIR__REMOVE_NAME : DIR__ADD_NAME); rc = avc_has_perm(&selinux_state, sid, dsec->sid, SECCLASS_DIR, av, &ad); if (rc) return rc; switch (kind) { case MAY_LINK: av = FILE__LINK; break; case MAY_UNLINK: av = FILE__UNLINK; break; case MAY_RMDIR: av = DIR__RMDIR; break; default: pr_warn("SELinux: %s: unrecognized kind %d\n", __func__, kind); return 0; } rc = avc_has_perm(&selinux_state, sid, isec->sid, isec->sclass, av, &ad); return rc; } static inline int may_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct inode_security_struct *old_dsec, *new_dsec, *old_isec, *new_isec; struct common_audit_data ad; u32 sid = current_sid(); u32 av; int old_is_dir, new_is_dir; int rc; old_dsec = inode_security(old_dir); old_isec = backing_inode_security(old_dentry); old_is_dir = d_is_dir(old_dentry); new_dsec = inode_security(new_dir); ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = old_dentry; rc = avc_has_perm(&selinux_state, sid, old_dsec->sid, SECCLASS_DIR, DIR__REMOVE_NAME | DIR__SEARCH, &ad); if (rc) return rc; rc = avc_has_perm(&selinux_state, sid, old_isec->sid, old_isec->sclass, FILE__RENAME, &ad); if (rc) return rc; if (old_is_dir && new_dir != old_dir) { rc = avc_has_perm(&selinux_state, sid, old_isec->sid, old_isec->sclass, DIR__REPARENT, &ad); if (rc) return rc; } ad.u.dentry = new_dentry; av = DIR__ADD_NAME | DIR__SEARCH; if (d_is_positive(new_dentry)) av |= DIR__REMOVE_NAME; rc = avc_has_perm(&selinux_state, sid, new_dsec->sid, SECCLASS_DIR, av, &ad); if (rc) return rc; if (d_is_positive(new_dentry)) { new_isec = backing_inode_security(new_dentry); new_is_dir = d_is_dir(new_dentry); rc = avc_has_perm(&selinux_state, sid, new_isec->sid, new_isec->sclass, (new_is_dir ? DIR__RMDIR : FILE__UNLINK), &ad); if (rc) return rc; } return 0; } /* Check whether a task can perform a filesystem operation. */ static int superblock_has_perm(const struct cred *cred, struct super_block *sb, u32 perms, struct common_audit_data *ad) { struct superblock_security_struct *sbsec; u32 sid = cred_sid(cred); sbsec = sb->s_security; return avc_has_perm(&selinux_state, sid, sbsec->sid, SECCLASS_FILESYSTEM, perms, ad); } /* Convert a Linux mode and permission mask to an access vector. */ static inline u32 file_mask_to_av(int mode, int mask) { u32 av = 0; if (!S_ISDIR(mode)) { if (mask & MAY_EXEC) av |= FILE__EXECUTE; if (mask & MAY_READ) av |= FILE__READ; if (mask & MAY_APPEND) av |= FILE__APPEND; else if (mask & MAY_WRITE) av |= FILE__WRITE; } else { if (mask & MAY_EXEC) av |= DIR__SEARCH; if (mask & MAY_WRITE) av |= DIR__WRITE; if (mask & MAY_READ) av |= DIR__READ; } return av; } /* Convert a Linux file to an access vector. */ static inline u32 file_to_av(struct file *file) { u32 av = 0; if (file->f_mode & FMODE_READ) av |= FILE__READ; if (file->f_mode & FMODE_WRITE) { if (file->f_flags & O_APPEND) av |= FILE__APPEND; else av |= FILE__WRITE; } if (!av) { /* * Special file opened with flags 3 for ioctl-only use. */ av = FILE__IOCTL; } return av; } /* * Convert a file to an access vector and include the correct * open permission. */ static inline u32 open_file_to_av(struct file *file) { u32 av = file_to_av(file); struct inode *inode = file_inode(file); if (selinux_policycap_openperm() && inode->i_sb->s_magic != SOCKFS_MAGIC) av |= FILE__OPEN; return av; } /* Hook functions begin here. */ static int selinux_binder_set_context_mgr(const struct cred *mgr) { return avc_has_perm(&selinux_state, current_sid(), cred_sid(mgr), SECCLASS_BINDER, BINDER__SET_CONTEXT_MGR, NULL); } static int selinux_binder_transaction(const struct cred *from, const struct cred *to) { u32 mysid = current_sid(); u32 fromsid = cred_sid(from); u32 tosid = cred_sid(to); int rc; if (mysid != fromsid) { rc = avc_has_perm(&selinux_state, mysid, fromsid, SECCLASS_BINDER, BINDER__IMPERSONATE, NULL); if (rc) return rc; } return avc_has_perm(&selinux_state, fromsid, tosid, SECCLASS_BINDER, BINDER__CALL, NULL); } static int selinux_binder_transfer_binder(const struct cred *from, const struct cred *to) { return avc_has_perm(&selinux_state, cred_sid(from), cred_sid(to), SECCLASS_BINDER, BINDER__TRANSFER, NULL); } static int selinux_binder_transfer_file(const struct cred *from, const struct cred *to, struct file *file) { u32 sid = cred_sid(to); struct file_security_struct *fsec = selinux_file(file); struct dentry *dentry = file->f_path.dentry; struct inode_security_struct *isec; struct common_audit_data ad; int rc; ad.type = LSM_AUDIT_DATA_PATH; ad.u.path = file->f_path; if (sid != fsec->sid) { rc = avc_has_perm(&selinux_state, sid, fsec->sid, SECCLASS_FD, FD__USE, &ad); if (rc) return rc; } #ifdef CONFIG_BPF_SYSCALL rc = bpf_fd_pass(file, sid); if (rc) return rc; #endif if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; isec = backing_inode_security(dentry); return avc_has_perm(&selinux_state, sid, isec->sid, isec->sclass, file_to_av(file), &ad); } static int selinux_ptrace_access_check(struct task_struct *child, unsigned int mode) { u32 sid = current_sid(); u32 csid = task_sid(child); if (mode & PTRACE_MODE_READ) return avc_has_perm(&selinux_state, sid, csid, SECCLASS_FILE, FILE__READ, NULL); return avc_has_perm(&selinux_state, sid, csid, SECCLASS_PROCESS, PROCESS__PTRACE, NULL); } static int selinux_ptrace_traceme(struct task_struct *parent) { return avc_has_perm(&selinux_state, task_sid(parent), current_sid(), SECCLASS_PROCESS, PROCESS__PTRACE, NULL); } static int selinux_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { return avc_has_perm(&selinux_state, current_sid(), task_sid(target), SECCLASS_PROCESS, PROCESS__GETCAP, NULL); } static int selinux_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) { return avc_has_perm(&selinux_state, cred_sid(old), cred_sid(new), SECCLASS_PROCESS, PROCESS__SETCAP, NULL); } /* * (This comment used to live with the selinux_task_setuid hook, * which was removed). * * Since setuid only affects the current process, and since the SELinux * controls are not based on the Linux identity attributes, SELinux does not * need to control this operation. However, SELinux does control the use of * the CAP_SETUID and CAP_SETGID capabilities using the capable hook. */ static int selinux_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { return cred_has_capability(cred, cap, opts, ns == &init_user_ns); } static int selinux_quotactl(int cmds, int type, int id, struct super_block *sb) { const struct cred *cred = current_cred(); int rc = 0; if (!sb) return 0; switch (cmds) { case Q_SYNC: case Q_QUOTAON: case Q_QUOTAOFF: case Q_SETINFO: case Q_SETQUOTA: case Q_XQUOTAOFF: case Q_XQUOTAON: case Q_XSETQLIM: rc = superblock_has_perm(cred, sb, FILESYSTEM__QUOTAMOD, NULL); break; case Q_GETFMT: case Q_GETINFO: case Q_GETQUOTA: case Q_XGETQUOTA: case Q_XGETQSTAT: case Q_XGETQSTATV: case Q_XGETNEXTQUOTA: rc = superblock_has_perm(cred, sb, FILESYSTEM__QUOTAGET, NULL); break; default: rc = 0; /* let the kernel handle invalid cmds */ break; } return rc; } static int selinux_quota_on(struct dentry *dentry) { const struct cred *cred = current_cred(); return dentry_has_perm(cred, dentry, FILE__QUOTAON); } static int selinux_syslog(int type) { switch (type) { case SYSLOG_ACTION_READ_ALL: /* Read last kernel messages */ case SYSLOG_ACTION_SIZE_BUFFER: /* Return size of the log buffer */ return avc_has_perm(&selinux_state, current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__SYSLOG_READ, NULL); case SYSLOG_ACTION_CONSOLE_OFF: /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_ON: /* Enable logging to console */ /* Set level of messages printed to console */ case SYSLOG_ACTION_CONSOLE_LEVEL: return avc_has_perm(&selinux_state, current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__SYSLOG_CONSOLE, NULL); } /* All other syslog types */ return avc_has_perm(&selinux_state, current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__SYSLOG_MOD, NULL); } /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to * succeed and -ENOMEM implies there is not. * * Do not audit the selinux permission check, as this is applied to all * processes that allocate mappings. */ static int selinux_vm_enough_memory(struct mm_struct *mm, long pages) { int rc, cap_sys_admin = 0; rc = cred_has_capability(current_cred(), CAP_SYS_ADMIN, CAP_OPT_NOAUDIT, true); if (rc == 0) cap_sys_admin = 1; return cap_sys_admin; } /* binprm security operations */ static u32 ptrace_parent_sid(void) { u32 sid = 0; struct task_struct *tracer; rcu_read_lock(); tracer = ptrace_parent(current); if (tracer) sid = task_sid(tracer); rcu_read_unlock(); return sid; } static int check_nnp_nosuid(const struct linux_binprm *bprm, const struct task_security_struct *old_tsec, const struct task_security_struct *new_tsec) { int nnp = (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS); int nosuid = !mnt_may_suid(bprm->file->f_path.mnt); int rc; u32 av; if (!nnp && !nosuid) return 0; /* neither NNP nor nosuid */ if (new_tsec->sid == old_tsec->sid) return 0; /* No change in credentials */ /* * If the policy enables the nnp_nosuid_transition policy capability, * then we permit transitions under NNP or nosuid if the * policy allows the corresponding permission between * the old and new contexts. */ if (selinux_policycap_nnp_nosuid_transition()) { av = 0; if (nnp) av |= PROCESS2__NNP_TRANSITION; if (nosuid) av |= PROCESS2__NOSUID_TRANSITION; rc = avc_has_perm(&selinux_state, old_tsec->sid, new_tsec->sid, SECCLASS_PROCESS2, av, NULL); if (!rc) return 0; } /* * We also permit NNP or nosuid transitions to bounded SIDs, * i.e. SIDs that are guaranteed to only be allowed a subset * of the permissions of the current SID. */ rc = security_bounded_transition(&selinux_state, old_tsec->sid, new_tsec->sid); if (!rc) return 0; /* * On failure, preserve the errno values for NNP vs nosuid. * NNP: Operation not permitted for caller. * nosuid: Permission denied to file. */ if (nnp) return -EPERM; return -EACCES; } static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) { const struct task_security_struct *old_tsec; struct task_security_struct *new_tsec; struct inode_security_struct *isec; struct common_audit_data ad; struct inode *inode = file_inode(bprm->file); int rc; /* SELinux context only depends on initial program or script and not * the script interpreter */ old_tsec = selinux_cred(current_cred()); new_tsec = selinux_cred(bprm->cred); isec = inode_security(inode); /* Default to the current task SID. */ new_tsec->sid = old_tsec->sid; new_tsec->osid = old_tsec->sid; /* Reset fs, key, and sock SIDs on execve. */ new_tsec->create_sid = 0; new_tsec->keycreate_sid = 0; new_tsec->sockcreate_sid = 0; if (old_tsec->exec_sid) { new_tsec->sid = old_tsec->exec_sid; /* Reset exec SID on execve. */ new_tsec->exec_sid = 0; /* Fail on NNP or nosuid if not an allowed transition. */ rc = check_nnp_nosuid(bprm, old_tsec, new_tsec); if (rc) return rc; } else { /* Check for a default transition on this program. */ rc = security_transition_sid(&selinux_state, old_tsec->sid, isec->sid, SECCLASS_PROCESS, NULL, &new_tsec->sid); if (rc) return rc; /* * Fallback to old SID on NNP or nosuid if not an allowed * transition. */ rc = check_nnp_nosuid(bprm, old_tsec, new_tsec); if (rc) new_tsec->sid = old_tsec->sid; } ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = bprm->file; if (new_tsec->sid == old_tsec->sid) { rc = avc_has_perm(&selinux_state, old_tsec->sid, isec->sid, SECCLASS_FILE, FILE__EXECUTE_NO_TRANS, &ad); if (rc) return rc; } else { /* Check permissions for the transition. */ rc = avc_has_perm(&selinux_state, old_tsec->sid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__TRANSITION, &ad); if (rc) return rc; rc = avc_has_perm(&selinux_state, new_tsec->sid, isec->sid, SECCLASS_FILE, FILE__ENTRYPOINT, &ad); if (rc) return rc; /* Check for shared state */ if (bprm->unsafe & LSM_UNSAFE_SHARE) { rc = avc_has_perm(&selinux_state, old_tsec->sid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__SHARE, NULL); if (rc) return -EPERM; } /* Make sure that anyone attempting to ptrace over a task that * changes its SID has the appropriate permit */ if (bprm->unsafe & LSM_UNSAFE_PTRACE) { u32 ptsid = ptrace_parent_sid(); if (ptsid != 0) { rc = avc_has_perm(&selinux_state, ptsid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__PTRACE, NULL); if (rc) return -EPERM; } } /* Clear any possibly unsafe personality bits on exec: */ bprm->per_clear |= PER_CLEAR_ON_SETID; /* Enable secure mode for SIDs transitions unless the noatsecure permission is granted between the two SIDs, i.e. ahp returns 0. */ rc = avc_has_perm(&selinux_state, old_tsec->sid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__NOATSECURE, NULL); bprm->secureexec |= !!rc; } return 0; } static int match_file(const void *p, struct file *file, unsigned fd) { return file_has_perm(p, file, file_to_av(file)) ? fd + 1 : 0; } /* Derived from fs/exec.c:flush_old_files. */ static inline void flush_unauthorized_files(const struct cred *cred, struct files_struct *files) { struct file *file, *devnull = NULL; struct tty_struct *tty; int drop_tty = 0; unsigned n; tty = get_current_tty(); if (tty) { spin_lock(&tty->files_lock); if (!list_empty(&tty->tty_files)) { struct tty_file_private *file_priv; /* Revalidate access to controlling tty. Use file_path_has_perm on the tty path directly rather than using file_has_perm, as this particular open file may belong to another process and we are only interested in the inode-based check here. */ file_priv = list_first_entry(&tty->tty_files, struct tty_file_private, list); file = file_priv->file; if (file_path_has_perm(cred, file, FILE__READ | FILE__WRITE)) drop_tty = 1; } spin_unlock(&tty->files_lock); tty_kref_put(tty); } /* Reset controlling tty. */ if (drop_tty) no_tty(); /* Revalidate access to inherited open files. */ n = iterate_fd(files, 0, match_file, cred); if (!n) /* none found? */ return; devnull = dentry_open(&selinux_null, O_RDWR, cred); if (IS_ERR(devnull)) devnull = NULL; /* replace all the matching ones with this */ do { replace_fd(n - 1, devnull, 0); } while ((n = iterate_fd(files, n, match_file, cred)) != 0); if (devnull) fput(devnull); } /* * Prepare a process for imminent new credential changes due to exec */ static void selinux_bprm_committing_creds(struct linux_binprm *bprm) { struct task_security_struct *new_tsec; struct rlimit *rlim, *initrlim; int rc, i; new_tsec = selinux_cred(bprm->cred); if (new_tsec->sid == new_tsec->osid) return; /* Close files for which the new task SID is not authorized. */ flush_unauthorized_files(bprm->cred, current->files); /* Always clear parent death signal on SID transitions. */ current->pdeath_signal = 0; /* Check whether the new SID can inherit resource limits from the old * SID. If not, reset all soft limits to the lower of the current * task's hard limit and the init task's soft limit. * * Note that the setting of hard limits (even to lower them) can be * controlled by the setrlimit check. The inclusion of the init task's * soft limit into the computation is to avoid resetting soft limits * higher than the default soft limit for cases where the default is * lower than the hard limit, e.g. RLIMIT_CORE or RLIMIT_STACK. */ rc = avc_has_perm(&selinux_state, new_tsec->osid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__RLIMITINH, NULL); if (rc) { /* protect against do_prlimit() */ task_lock(current); for (i = 0; i < RLIM_NLIMITS; i++) { rlim = current->signal->rlim + i; initrlim = init_task.signal->rlim + i; rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur); } task_unlock(current); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) update_rlimit_cpu(current, rlimit(RLIMIT_CPU)); } } /* * Clean up the process immediately after the installation of new credentials * due to exec */ static void selinux_bprm_committed_creds(struct linux_binprm *bprm) { const struct task_security_struct *tsec = selinux_cred(current_cred()); u32 osid, sid; int rc; osid = tsec->osid; sid = tsec->sid; if (sid == osid) return; /* Check whether the new SID can inherit signal state from the old SID. * If not, clear itimers to avoid subsequent signal generation and * flush and unblock signals. * * This must occur _after_ the task SID has been updated so that any * kill done after the flush will be checked against the new SID. */ rc = avc_has_perm(&selinux_state, osid, sid, SECCLASS_PROCESS, PROCESS__SIGINH, NULL); if (rc) { clear_itimer(); spin_lock_irq(&current->sighand->siglock); if (!fatal_signal_pending(current)) { flush_sigqueue(&current->pending); flush_sigqueue(&current->signal->shared_pending); flush_signal_handlers(current, 1); sigemptyset(&current->blocked); recalc_sigpending(); } spin_unlock_irq(&current->sighand->siglock); } /* Wake up the parent if it is waiting so that it can recheck * wait permission to the new task SID. */ read_lock(&tasklist_lock); __wake_up_parent(current, current->real_parent); read_unlock(&tasklist_lock); } /* superblock security operations */ static int selinux_sb_alloc_security(struct super_block *sb) { struct superblock_security_struct *sbsec; sbsec = kzalloc(sizeof(struct superblock_security_struct), GFP_KERNEL); if (!sbsec) return -ENOMEM; mutex_init(&sbsec->lock); INIT_LIST_HEAD(&sbsec->isec_head); spin_lock_init(&sbsec->isec_lock); sbsec->sb = sb; sbsec->sid = SECINITSID_UNLABELED; sbsec->def_sid = SECINITSID_FILE; sbsec->mntpoint_sid = SECINITSID_UNLABELED; sb->s_security = sbsec; return 0; } static void selinux_sb_free_security(struct super_block *sb) { superblock_free_security(sb); } static inline int opt_len(const char *s) { bool open_quote = false; int len; char c; for (len = 0; (c = s[len]) != '\0'; len++) { if (c == '"') open_quote = !open_quote; if (c == ',' && !open_quote) break; } return len; } static int selinux_sb_eat_lsm_opts(char *options, void **mnt_opts) { char *from = options; char *to = options; bool first = true; int rc; while (1) { int len = opt_len(from); int token; char *arg = NULL; token = match_opt_prefix(from, len, &arg); if (token != Opt_error) { char *p, *q; /* strip quotes */ if (arg) { for (p = q = arg; p < from + len; p++) { char c = *p; if (c != '"') *q++ = c; } arg = kmemdup_nul(arg, q - arg, GFP_KERNEL); if (!arg) { rc = -ENOMEM; goto free_opt; } } rc = selinux_add_opt(token, arg, mnt_opts); if (unlikely(rc)) { kfree(arg); goto free_opt; } } else { if (!first) { // copy with preceding comma from--; len++; } if (to != from) memmove(to, from, len); to += len; first = false; } if (!from[len]) break; from += len + 1; } *to = '\0'; return 0; free_opt: if (*mnt_opts) { selinux_free_mnt_opts(*mnt_opts); *mnt_opts = NULL; } return rc; } static int selinux_sb_remount(struct super_block *sb, void *mnt_opts) { struct selinux_mnt_opts *opts = mnt_opts; struct superblock_security_struct *sbsec = sb->s_security; u32 sid; int rc; if (!(sbsec->flags & SE_SBINITIALIZED)) return 0; if (!opts) return 0; if (opts->fscontext) { rc = parse_sid(sb, opts->fscontext, &sid); if (rc) return rc; if (bad_option(sbsec, FSCONTEXT_MNT, sbsec->sid, sid)) goto out_bad_option; } if (opts->context) { rc = parse_sid(sb, opts->context, &sid); if (rc) return rc; if (bad_option(sbsec, CONTEXT_MNT, sbsec->mntpoint_sid, sid)) goto out_bad_option; } if (opts->rootcontext) { struct inode_security_struct *root_isec; root_isec = backing_inode_security(sb->s_root); rc = parse_sid(sb, opts->rootcontext, &sid); if (rc) return rc; if (bad_option(sbsec, ROOTCONTEXT_MNT, root_isec->sid, sid)) goto out_bad_option; } if (opts->defcontext) { rc = parse_sid(sb, opts->defcontext, &sid); if (rc) return rc; if (bad_option(sbsec, DEFCONTEXT_MNT, sbsec->def_sid, sid)) goto out_bad_option; } return 0; out_bad_option: pr_warn("SELinux: unable to change security options " "during remount (dev %s, type=%s)\n", sb->s_id, sb->s_type->name); return -EINVAL; } static int selinux_sb_kern_mount(struct super_block *sb) { const struct cred *cred = current_cred(); struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = sb->s_root; return superblock_has_perm(cred, sb, FILESYSTEM__MOUNT, &ad); } static int selinux_sb_statfs(struct dentry *dentry) { const struct cred *cred = current_cred(); struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry->d_sb->s_root; return superblock_has_perm(cred, dentry->d_sb, FILESYSTEM__GETATTR, &ad); } static int selinux_mount(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data) { const struct cred *cred = current_cred(); if (flags & MS_REMOUNT) return superblock_has_perm(cred, path->dentry->d_sb, FILESYSTEM__REMOUNT, NULL); else return path_has_perm(cred, path, FILE__MOUNTON); } static int selinux_move_mount(const struct path *from_path, const struct path *to_path) { const struct cred *cred = current_cred(); return path_has_perm(cred, to_path, FILE__MOUNTON); } static int selinux_umount(struct vfsmount *mnt, int flags) { const struct cred *cred = current_cred(); return superblock_has_perm(cred, mnt->mnt_sb, FILESYSTEM__UNMOUNT, NULL); } static int selinux_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) { const struct selinux_mnt_opts *src = src_fc->security; struct selinux_mnt_opts *opts; if (!src) return 0; fc->security = kzalloc(sizeof(struct selinux_mnt_opts), GFP_KERNEL); if (!fc->security) return -ENOMEM; opts = fc->security; if (src->fscontext) { opts->fscontext = kstrdup(src->fscontext, GFP_KERNEL); if (!opts->fscontext) return -ENOMEM; } if (src->context) { opts->context = kstrdup(src->context, GFP_KERNEL); if (!opts->context) return -ENOMEM; } if (src->rootcontext) { opts->rootcontext = kstrdup(src->rootcontext, GFP_KERNEL); if (!opts->rootcontext) return -ENOMEM; } if (src->defcontext) { opts->defcontext = kstrdup(src->defcontext, GFP_KERNEL); if (!opts->defcontext) return -ENOMEM; } return 0; } static const struct fs_parameter_spec selinux_fs_parameters[] = { fsparam_string(CONTEXT_STR, Opt_context), fsparam_string(DEFCONTEXT_STR, Opt_defcontext), fsparam_string(FSCONTEXT_STR, Opt_fscontext), fsparam_string(ROOTCONTEXT_STR, Opt_rootcontext), fsparam_flag (SECLABEL_STR, Opt_seclabel), {} }; static int selinux_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct fs_parse_result result; int opt, rc; opt = fs_parse(fc, selinux_fs_parameters, param, &result); if (opt < 0) return opt; rc = selinux_add_opt(opt, param->string, &fc->security); if (!rc) param->string = NULL; return rc; } /* inode security operations */ static int selinux_inode_alloc_security(struct inode *inode) { struct inode_security_struct *isec = selinux_inode(inode); u32 sid = current_sid(); spin_lock_init(&isec->lock); INIT_LIST_HEAD(&isec->list); isec->inode = inode; isec->sid = SECINITSID_UNLABELED; isec->sclass = SECCLASS_FILE; isec->task_sid = sid; isec->initialized = LABEL_INVALID; return 0; } static void selinux_inode_free_security(struct inode *inode) { inode_free_security(inode); } static int selinux_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, void **ctx, u32 *ctxlen) { u32 newsid; int rc; rc = selinux_determine_inode_label(selinux_cred(current_cred()), d_inode(dentry->d_parent), name, inode_mode_to_security_class(mode), &newsid); if (rc) return rc; return security_sid_to_context(&selinux_state, newsid, (char **)ctx, ctxlen); } static int selinux_dentry_create_files_as(struct dentry *dentry, int mode, struct qstr *name, const struct cred *old, struct cred *new) { u32 newsid; int rc; struct task_security_struct *tsec; rc = selinux_determine_inode_label(selinux_cred(old), d_inode(dentry->d_parent), name, inode_mode_to_security_class(mode), &newsid); if (rc) return rc; tsec = selinux_cred(new); tsec->create_sid = newsid; return 0; } static int selinux_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, const char **name, void **value, size_t *len) { const struct task_security_struct *tsec = selinux_cred(current_cred()); struct superblock_security_struct *sbsec; u32 newsid, clen; int rc; char *context; sbsec = dir->i_sb->s_security; newsid = tsec->create_sid; rc = selinux_determine_inode_label(tsec, dir, qstr, inode_mode_to_security_class(inode->i_mode), &newsid); if (rc) return rc; /* Possibly defer initialization to selinux_complete_init. */ if (sbsec->flags & SE_SBINITIALIZED) { struct inode_security_struct *isec = selinux_inode(inode); isec->sclass = inode_mode_to_security_class(inode->i_mode); isec->sid = newsid; isec->initialized = LABEL_INITIALIZED; } if (!selinux_initialized(&selinux_state) || !(sbsec->flags & SBLABEL_MNT)) return -EOPNOTSUPP; if (name) *name = XATTR_SELINUX_SUFFIX; if (value && len) { rc = security_sid_to_context_force(&selinux_state, newsid, &context, &clen); if (rc) return rc; *value = context; *len = clen; } return 0; } static int selinux_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode) { return may_create(dir, dentry, SECCLASS_FILE); } static int selinux_inode_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { return may_link(dir, old_dentry, MAY_LINK); } static int selinux_inode_unlink(struct inode *dir, struct dentry *dentry) { return may_link(dir, dentry, MAY_UNLINK); } static int selinux_inode_symlink(struct inode *dir, struct dentry *dentry, const char *name) { return may_create(dir, dentry, SECCLASS_LNK_FILE); } static int selinux_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mask) { return may_create(dir, dentry, SECCLASS_DIR); } static int selinux_inode_rmdir(struct inode *dir, struct dentry *dentry) { return may_link(dir, dentry, MAY_RMDIR); } static int selinux_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { return may_create(dir, dentry, inode_mode_to_security_class(mode)); } static int selinux_inode_rename(struct inode *old_inode, struct dentry *old_dentry, struct inode *new_inode, struct dentry *new_dentry) { return may_rename(old_inode, old_dentry, new_inode, new_dentry); } static int selinux_inode_readlink(struct dentry *dentry) { const struct cred *cred = current_cred(); return dentry_has_perm(cred, dentry, FILE__READ); } static int selinux_inode_follow_link(struct dentry *dentry, struct inode *inode, bool rcu) { const struct cred *cred = current_cred(); struct common_audit_data ad; struct inode_security_struct *isec; u32 sid; validate_creds(cred); ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; sid = cred_sid(cred); isec = inode_security_rcu(inode, rcu); if (IS_ERR(isec)) return PTR_ERR(isec); return avc_has_perm_flags(&selinux_state, sid, isec->sid, isec->sclass, FILE__READ, &ad, rcu ? MAY_NOT_BLOCK : 0); } static noinline int audit_inode_permission(struct inode *inode, u32 perms, u32 audited, u32 denied, int result) { struct common_audit_data ad; struct inode_security_struct *isec = selinux_inode(inode); int rc; ad.type = LSM_AUDIT_DATA_INODE; ad.u.inode = inode; rc = slow_avc_audit(&selinux_state, current_sid(), isec->sid, isec->sclass, perms, audited, denied, result, &ad); if (rc) return rc; return 0; } static int selinux_inode_permission(struct inode *inode, int mask) { const struct cred *cred = current_cred(); u32 perms; bool from_access; bool no_block = mask & MAY_NOT_BLOCK; struct inode_security_struct *isec; u32 sid; struct av_decision avd; int rc, rc2; u32 audited, denied; from_access = mask & MAY_ACCESS; mask &= (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND); /* No permission to check. Existence test. */ if (!mask) return 0; validate_creds(cred); if (unlikely(IS_PRIVATE(inode))) return 0; perms = file_mask_to_av(inode->i_mode, mask); sid = cred_sid(cred); isec = inode_security_rcu(inode, no_block); if (IS_ERR(isec)) return PTR_ERR(isec); rc = avc_has_perm_noaudit(&selinux_state, sid, isec->sid, isec->sclass, perms, no_block ? AVC_NONBLOCKING : 0, &avd); audited = avc_audit_required(perms, &avd, rc, from_access ? FILE__AUDIT_ACCESS : 0, &denied); if (likely(!audited)) return rc; /* fall back to ref-walk if we have to generate audit */ if (no_block) return -ECHILD; rc2 = audit_inode_permission(inode, perms, audited, denied, rc); if (rc2) return rc2; return rc; } static int selinux_inode_setattr(struct dentry *dentry, struct iattr *iattr) { const struct cred *cred = current_cred(); struct inode *inode = d_backing_inode(dentry); unsigned int ia_valid = iattr->ia_valid; __u32 av = FILE__WRITE; /* ATTR_FORCE is just used for ATTR_KILL_S[UG]ID. */ if (ia_valid & ATTR_FORCE) { ia_valid &= ~(ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_MODE | ATTR_FORCE); if (!ia_valid) return 0; } if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_TIMES_SET)) return dentry_has_perm(cred, dentry, FILE__SETATTR); if (selinux_policycap_openperm() && inode->i_sb->s_magic != SOCKFS_MAGIC && (ia_valid & ATTR_SIZE) && !(ia_valid & ATTR_FILE)) av |= FILE__OPEN; return dentry_has_perm(cred, dentry, av); } static int selinux_inode_getattr(const struct path *path) { return path_has_perm(current_cred(), path, FILE__GETATTR); } static bool has_cap_mac_admin(bool audit) { const struct cred *cred = current_cred(); unsigned int opts = audit ? CAP_OPT_NONE : CAP_OPT_NOAUDIT; if (cap_capable(cred, &init_user_ns, CAP_MAC_ADMIN, opts)) return false; if (cred_has_capability(cred, CAP_MAC_ADMIN, opts, true)) return false; return true; } static int selinux_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = d_backing_inode(dentry); struct inode_security_struct *isec; struct superblock_security_struct *sbsec; struct common_audit_data ad; u32 newsid, sid = current_sid(); int rc = 0; if (strcmp(name, XATTR_NAME_SELINUX)) { rc = cap_inode_setxattr(dentry, name, value, size, flags); if (rc) return rc; /* Not an attribute we recognize, so just check the ordinary setattr permission. */ return dentry_has_perm(current_cred(), dentry, FILE__SETATTR); } if (!selinux_initialized(&selinux_state)) return (inode_owner_or_capable(inode) ? 0 : -EPERM); sbsec = inode->i_sb->s_security; if (!(sbsec->flags & SBLABEL_MNT)) return -EOPNOTSUPP; if (!inode_owner_or_capable(inode)) return -EPERM; ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; isec = backing_inode_security(dentry); rc = avc_has_perm(&selinux_state, sid, isec->sid, isec->sclass, FILE__RELABELFROM, &ad); if (rc) return rc; rc = security_context_to_sid(&selinux_state, value, size, &newsid, GFP_KERNEL); if (rc == -EINVAL) { if (!has_cap_mac_admin(true)) { struct audit_buffer *ab; size_t audit_size; /* We strip a nul only if it is at the end, otherwise the * context contains a nul and we should audit that */ if (value) { const char *str = value; if (str[size - 1] == '\0') audit_size = size - 1; else audit_size = size; } else { audit_size = 0; } ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR); audit_log_format(ab, "op=setxattr invalid_context="); audit_log_n_untrustedstring(ab, value, audit_size); audit_log_end(ab); return rc; } rc = security_context_to_sid_force(&selinux_state, value, size, &newsid); } if (rc) return rc; rc = avc_has_perm(&selinux_state, sid, newsid, isec->sclass, FILE__RELABELTO, &ad); if (rc) return rc; rc = security_validate_transition(&selinux_state, isec->sid, newsid, sid, isec->sclass); if (rc) return rc; return avc_has_perm(&selinux_state, newsid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__ASSOCIATE, &ad); } static void selinux_inode_post_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = d_backing_inode(dentry); struct inode_security_struct *isec; u32 newsid; int rc; if (strcmp(name, XATTR_NAME_SELINUX)) { /* Not an attribute we recognize, so nothing to do. */ return; } if (!selinux_initialized(&selinux_state)) { /* If we haven't even been initialized, then we can't validate * against a policy, so leave the label as invalid. It may * resolve to a valid label on the next revalidation try if * we've since initialized. */ return; } rc = security_context_to_sid_force(&selinux_state, value, size, &newsid); if (rc) { pr_err("SELinux: unable to map context to SID" "for (%s, %lu), rc=%d\n", inode->i_sb->s_id, inode->i_ino, -rc); return; } isec = backing_inode_security(dentry); spin_lock(&isec->lock); isec->sclass = inode_mode_to_security_class(inode->i_mode); isec->sid = newsid; isec->initialized = LABEL_INITIALIZED; spin_unlock(&isec->lock); return; } static int selinux_inode_getxattr(struct dentry *dentry, const char *name) { const struct cred *cred = current_cred(); return dentry_has_perm(cred, dentry, FILE__GETATTR); } static int selinux_inode_listxattr(struct dentry *dentry) { const struct cred *cred = current_cred(); return dentry_has_perm(cred, dentry, FILE__GETATTR); } static int selinux_inode_removexattr(struct dentry *dentry, const char *name) { if (strcmp(name, XATTR_NAME_SELINUX)) { int rc = cap_inode_removexattr(dentry, name); if (rc) return rc; /* Not an attribute we recognize, so just check the ordinary setattr permission. */ return dentry_has_perm(current_cred(), dentry, FILE__SETATTR); } if (!selinux_initialized(&selinux_state)) return 0; /* No one is allowed to remove a SELinux security label. You can change the label, but all data must be labeled. */ return -EACCES; } static int selinux_path_notify(const struct path *path, u64 mask, unsigned int obj_type) { int ret; u32 perm; struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_PATH; ad.u.path = *path; /* * Set permission needed based on the type of mark being set. * Performs an additional check for sb watches. */ switch (obj_type) { case FSNOTIFY_OBJ_TYPE_VFSMOUNT: perm = FILE__WATCH_MOUNT; break; case FSNOTIFY_OBJ_TYPE_SB: perm = FILE__WATCH_SB; ret = superblock_has_perm(current_cred(), path->dentry->d_sb, FILESYSTEM__WATCH, &ad); if (ret) return ret; break; case FSNOTIFY_OBJ_TYPE_INODE: perm = FILE__WATCH; break; default: return -EINVAL; } /* blocking watches require the file:watch_with_perm permission */ if (mask & (ALL_FSNOTIFY_PERM_EVENTS)) perm |= FILE__WATCH_WITH_PERM; /* watches on read-like events need the file:watch_reads permission */ if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_CLOSE_NOWRITE)) perm |= FILE__WATCH_READS; return path_has_perm(current_cred(), path, perm); } /* * Copy the inode security context value to the user. * * Permission check is handled by selinux_inode_getxattr hook. */ static int selinux_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc) { u32 size; int error; char *context = NULL; struct inode_security_struct *isec; /* * If we're not initialized yet, then we can't validate contexts, so * just let vfs_getxattr fall back to using the on-disk xattr. */ if (!selinux_initialized(&selinux_state) || strcmp(name, XATTR_SELINUX_SUFFIX)) return -EOPNOTSUPP; /* * If the caller has CAP_MAC_ADMIN, then get the raw context * value even if it is not defined by current policy; otherwise, * use the in-core value under current policy. * Use the non-auditing forms of the permission checks since * getxattr may be called by unprivileged processes commonly * and lack of permission just means that we fall back to the * in-core context value, not a denial. */ isec = inode_security(inode); if (has_cap_mac_admin(false)) error = security_sid_to_context_force(&selinux_state, isec->sid, &context, &size); else error = security_sid_to_context(&selinux_state, isec->sid, &context, &size); if (error) return error; error = size; if (alloc) { *buffer = context; goto out_nofree; } kfree(context); out_nofree: return error; } static int selinux_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags) { struct inode_security_struct *isec = inode_security_novalidate(inode); struct superblock_security_struct *sbsec = inode->i_sb->s_security; u32 newsid; int rc; if (strcmp(name, XATTR_SELINUX_SUFFIX)) return -EOPNOTSUPP; if (!(sbsec->flags & SBLABEL_MNT)) return -EOPNOTSUPP; if (!value || !size) return -EACCES; rc = security_context_to_sid(&selinux_state, value, size, &newsid, GFP_KERNEL); if (rc) return rc; spin_lock(&isec->lock); isec->sclass = inode_mode_to_security_class(inode->i_mode); isec->sid = newsid; isec->initialized = LABEL_INITIALIZED; spin_unlock(&isec->lock); return 0; } static int selinux_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size) { const int len = sizeof(XATTR_NAME_SELINUX); if (!selinux_initialized(&selinux_state)) return 0; if (buffer && len <= buffer_size) memcpy(buffer, XATTR_NAME_SELINUX, len); return len; } static void selinux_inode_getsecid(struct inode *inode, u32 *secid) { struct inode_security_struct *isec = inode_security_novalidate(inode); *secid = isec->sid; } static int selinux_inode_copy_up(struct dentry *src, struct cred **new) { u32 sid; struct task_security_struct *tsec; struct cred *new_creds = *new; if (new_creds == NULL) { new_creds = prepare_creds(); if (!new_creds) return -ENOMEM; } tsec = selinux_cred(new_creds); /* Get label from overlay inode and set it in create_sid */ selinux_inode_getsecid(d_inode(src), &sid); tsec->create_sid = sid; *new = new_creds; return 0; } static int selinux_inode_copy_up_xattr(const char *name) { /* The copy_up hook above sets the initial context on an inode, but we * don't then want to overwrite it by blindly copying all the lower * xattrs up. Instead, we have to filter out SELinux-related xattrs. */ if (strcmp(name, XATTR_NAME_SELINUX) == 0) return 1; /* Discard */ /* * Any other attribute apart from SELINUX is not claimed, supported * by selinux. */ return -EOPNOTSUPP; } /* kernfs node operations */ static int selinux_kernfs_init_security(struct kernfs_node *kn_dir, struct kernfs_node *kn) { const struct task_security_struct *tsec = selinux_cred(current_cred()); u32 parent_sid, newsid, clen; int rc; char *context; rc = kernfs_xattr_get(kn_dir, XATTR_NAME_SELINUX, NULL, 0); if (rc == -ENODATA) return 0; else if (rc < 0) return rc; clen = (u32)rc; context = kmalloc(clen, GFP_KERNEL); if (!context) return -ENOMEM; rc = kernfs_xattr_get(kn_dir, XATTR_NAME_SELINUX, context, clen); if (rc < 0) { kfree(context); return rc; } rc = security_context_to_sid(&selinux_state, context, clen, &parent_sid, GFP_KERNEL); kfree(context); if (rc) return rc; if (tsec->create_sid) { newsid = tsec->create_sid; } else { u16 secclass = inode_mode_to_security_class(kn->mode); struct qstr q; q.name = kn->name; q.hash_len = hashlen_string(kn_dir, kn->name); rc = security_transition_sid(&selinux_state, tsec->sid, parent_sid, secclass, &q, &newsid); if (rc) return rc; } rc = security_sid_to_context_force(&selinux_state, newsid, &context, &clen); if (rc) return rc; rc = kernfs_xattr_set(kn, XATTR_NAME_SELINUX, context, clen, XATTR_CREATE); kfree(context); return rc; } /* file security operations */ static int selinux_revalidate_file_permission(struct file *file, int mask) { const struct cred *cred = current_cred(); struct inode *inode = file_inode(file); /* file_mask_to_av won't add FILE__WRITE if MAY_APPEND is set */ if ((file->f_flags & O_APPEND) && (mask & MAY_WRITE)) mask |= MAY_APPEND; return file_has_perm(cred, file, file_mask_to_av(inode->i_mode, mask)); } static int selinux_file_permission(struct file *file, int mask) { struct inode *inode = file_inode(file); struct file_security_struct *fsec = selinux_file(file); struct inode_security_struct *isec; u32 sid = current_sid(); if (!mask) /* No permission to check. Existence test. */ return 0; isec = inode_security(inode); if (sid == fsec->sid && fsec->isid == isec->sid && fsec->pseqno == avc_policy_seqno(&selinux_state)) /* No change since file_open check. */ return 0; return selinux_revalidate_file_permission(file, mask); } static int selinux_file_alloc_security(struct file *file) { struct file_security_struct *fsec = selinux_file(file); u32 sid = current_sid(); fsec->sid = sid; fsec->fown_sid = sid; return 0; } /* * Check whether a task has the ioctl permission and cmd * operation to an inode. */ static int ioctl_has_perm(const struct cred *cred, struct file *file, u32 requested, u16 cmd) { struct common_audit_data ad; struct file_security_struct *fsec = selinux_file(file); struct inode *inode = file_inode(file); struct inode_security_struct *isec; struct lsm_ioctlop_audit ioctl; u32 ssid = cred_sid(cred); int rc; u8 driver = cmd >> 8; u8 xperm = cmd & 0xff; ad.type = LSM_AUDIT_DATA_IOCTL_OP; ad.u.op = &ioctl; ad.u.op->cmd = cmd; ad.u.op->path = file->f_path; if (ssid != fsec->sid) { rc = avc_has_perm(&selinux_state, ssid, fsec->sid, SECCLASS_FD, FD__USE, &ad); if (rc) goto out; } if (unlikely(IS_PRIVATE(inode))) return 0; isec = inode_security(inode); rc = avc_has_extended_perms(&selinux_state, ssid, isec->sid, isec->sclass, requested, driver, xperm, &ad); out: return rc; } static int selinux_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { const struct cred *cred = current_cred(); int error = 0; switch (cmd) { case FIONREAD: case FIBMAP: case FIGETBSZ: case FS_IOC_GETFLAGS: case FS_IOC_GETVERSION: error = file_has_perm(cred, file, FILE__GETATTR); break; case FS_IOC_SETFLAGS: case FS_IOC_SETVERSION: error = file_has_perm(cred, file, FILE__SETATTR); break; /* sys_ioctl() checks */ case FIONBIO: case FIOASYNC: error = file_has_perm(cred, file, 0); break; case KDSKBENT: case KDSKBSENT: error = cred_has_capability(cred, CAP_SYS_TTY_CONFIG, CAP_OPT_NONE, true); break; case FIOCLEX: case FIONCLEX: if (!selinux_policycap_ioctl_skip_cloexec()) error = ioctl_has_perm(cred, file, FILE__IOCTL, (u16) cmd); break; /* default case assumes that the command will go * to the file's ioctl() function. */ default: error = ioctl_has_perm(cred, file, FILE__IOCTL, (u16) cmd); } return error; } static int selinux_file_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) { /* * If we are in a 64-bit kernel running 32-bit userspace, we need to * make sure we don't compare 32-bit flags to 64-bit flags. */ switch (cmd) { case FS_IOC32_GETFLAGS: cmd = FS_IOC_GETFLAGS; break; case FS_IOC32_SETFLAGS: cmd = FS_IOC_SETFLAGS; break; case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; case FS_IOC32_SETVERSION: cmd = FS_IOC_SETVERSION; break; default: break; } return selinux_file_ioctl(file, cmd, arg); } static int default_noexec __ro_after_init; static int file_map_prot_check(struct file *file, unsigned long prot, int shared) { const struct cred *cred = current_cred(); u32 sid = cred_sid(cred); int rc = 0; if (default_noexec && (prot & PROT_EXEC) && (!file || IS_PRIVATE(file_inode(file)) || (!shared && (prot & PROT_WRITE)))) { /* * We are making executable an anonymous mapping or a * private file mapping that will also be writable. * This has an additional check. */ rc = avc_has_perm(&selinux_state, sid, sid, SECCLASS_PROCESS, PROCESS__EXECMEM, NULL); if (rc) goto error; } if (file) { /* read access is always possible with a mapping */ u32 av = FILE__READ; /* write access only matters if the mapping is shared */ if (shared && (prot & PROT_WRITE)) av |= FILE__WRITE; if (prot & PROT_EXEC) av |= FILE__EXECUTE; return file_has_perm(cred, file, av); } error: return rc; } static int selinux_mmap_addr(unsigned long addr) { int rc = 0; if (addr < CONFIG_LSM_MMAP_MIN_ADDR) { u32 sid = current_sid(); rc = avc_has_perm(&selinux_state, sid, sid, SECCLASS_MEMPROTECT, MEMPROTECT__MMAP_ZERO, NULL); } return rc; } static int selinux_mmap_file(struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags) { struct common_audit_data ad; int rc; if (file) { ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = file; rc = inode_has_perm(current_cred(), file_inode(file), FILE__MAP, &ad); if (rc) return rc; } if (checkreqprot_get(&selinux_state)) prot = reqprot; return file_map_prot_check(file, prot, (flags & MAP_TYPE) == MAP_SHARED); } static int selinux_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) { const struct cred *cred = current_cred(); u32 sid = cred_sid(cred); if (checkreqprot_get(&selinux_state)) prot = reqprot; if (default_noexec && (prot & PROT_EXEC) && !(vma->vm_flags & VM_EXEC)) { int rc = 0; if (vma->vm_start >= vma->vm_mm->start_brk && vma->vm_end <= vma->vm_mm->brk) { rc = avc_has_perm(&selinux_state, sid, sid, SECCLASS_PROCESS, PROCESS__EXECHEAP, NULL); } else if (!vma->vm_file && ((vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack) || vma_is_stack_for_current(vma))) { rc = avc_has_perm(&selinux_state, sid, sid, SECCLASS_PROCESS, PROCESS__EXECSTACK, NULL); } else if (vma->vm_file && vma->anon_vma) { /* * We are making executable a file mapping that has * had some COW done. Since pages might have been * written, check ability to execute the possibly * modified content. This typically should only * occur for text relocations. */ rc = file_has_perm(cred, vma->vm_file, FILE__EXECMOD); } if (rc) return rc; } return file_map_prot_check(vma->vm_file, prot, vma->vm_flags&VM_SHARED); } static int selinux_file_lock(struct file *file, unsigned int cmd) { const struct cred *cred = current_cred(); return file_has_perm(cred, file, FILE__LOCK); } static int selinux_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { const struct cred *cred = current_cred(); int err = 0; switch (cmd) { case F_SETFL: if ((file->f_flags & O_APPEND) && !(arg & O_APPEND)) { err = file_has_perm(cred, file, FILE__WRITE); break; } fallthrough; case F_SETOWN: case F_SETSIG: case F_GETFL: case F_GETOWN: case F_GETSIG: case F_GETOWNER_UIDS: /* Just check FD__USE permission */ err = file_has_perm(cred, file, 0); break; case F_GETLK: case F_SETLK: case F_SETLKW: case F_OFD_GETLK: case F_OFD_SETLK: case F_OFD_SETLKW: #if BITS_PER_LONG == 32 case F_GETLK64: case F_SETLK64: case F_SETLKW64: #endif err = file_has_perm(cred, file, FILE__LOCK); break; } return err; } static void selinux_file_set_fowner(struct file *file) { struct file_security_struct *fsec; fsec = selinux_file(file); fsec->fown_sid = current_sid(); } static int selinux_file_send_sigiotask(struct task_struct *tsk, struct fown_struct *fown, int signum) { struct file *file; u32 sid = task_sid(tsk); u32 perm; struct file_security_struct *fsec; /* struct fown_struct is never outside the context of a struct file */ file = container_of(fown, struct file, f_owner); fsec = selinux_file(file); if (!signum) perm = signal_to_av(SIGIO); /* as per send_sigio_to_task */ else perm = signal_to_av(signum); return avc_has_perm(&selinux_state, fsec->fown_sid, sid, SECCLASS_PROCESS, perm, NULL); } static int selinux_file_receive(struct file *file) { const struct cred *cred = current_cred(); return file_has_perm(cred, file, file_to_av(file)); } static int selinux_file_open(struct file *file) { struct file_security_struct *fsec; struct inode_security_struct *isec; fsec = selinux_file(file); isec = inode_security(file_inode(file)); /* * Save inode label and policy sequence number * at open-time so that selinux_file_permission * can determine whether revalidation is necessary. * Task label is already saved in the file security * struct as its SID. */ fsec->isid = isec->sid; fsec->pseqno = avc_policy_seqno(&selinux_state); /* * Since the inode label or policy seqno may have changed * between the selinux_inode_permission check and the saving * of state above, recheck that access is still permitted. * Otherwise, access might never be revalidated against the * new inode label or new policy. * This check is not redundant - do not remove. */ return file_path_has_perm(file->f_cred, file, open_file_to_av(file)); } /* task security operations */ static int selinux_task_alloc(struct task_struct *task, unsigned long clone_flags) { u32 sid = current_sid(); return avc_has_perm(&selinux_state, sid, sid, SECCLASS_PROCESS, PROCESS__FORK, NULL); } /* * prepare a new set of credentials for modification */ static int selinux_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) { const struct task_security_struct *old_tsec = selinux_cred(old); struct task_security_struct *tsec = selinux_cred(new); *tsec = *old_tsec; return 0; } /* * transfer the SELinux data to a blank set of creds */ static void selinux_cred_transfer(struct cred *new, const struct cred *old) { const struct task_security_struct *old_tsec = selinux_cred(old); struct task_security_struct *tsec = selinux_cred(new); *tsec = *old_tsec; } static void selinux_cred_getsecid(const struct cred *c, u32 *secid) { *secid = cred_sid(c); } /* * set the security data for a kernel service * - all the creation contexts are set to unlabelled */ static int selinux_kernel_act_as(struct cred *new, u32 secid) { struct task_security_struct *tsec = selinux_cred(new); u32 sid = current_sid(); int ret; ret = avc_has_perm(&selinux_state, sid, secid, SECCLASS_KERNEL_SERVICE, KERNEL_SERVICE__USE_AS_OVERRIDE, NULL); if (ret == 0) { tsec->sid = secid; tsec->create_sid = 0; tsec->keycreate_sid = 0; tsec->sockcreate_sid = 0; } return ret; } /* * set the file creation context in a security record to the same as the * objective context of the specified inode */ static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode) { struct inode_security_struct *isec = inode_security(inode); struct task_security_struct *tsec = selinux_cred(new); u32 sid = current_sid(); int ret; ret = avc_has_perm(&selinux_state, sid, isec->sid, SECCLASS_KERNEL_SERVICE, KERNEL_SERVICE__CREATE_FILES_AS, NULL); if (ret == 0) tsec->create_sid = isec->sid; return ret; } static int selinux_kernel_module_request(char *kmod_name) { struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_KMOD; ad.u.kmod_name = kmod_name; return avc_has_perm(&selinux_state, current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__MODULE_REQUEST, &ad); } static int selinux_kernel_module_from_file(struct file *file) { struct common_audit_data ad; struct inode_security_struct *isec; struct file_security_struct *fsec; u32 sid = current_sid(); int rc; /* init_module */ if (file == NULL) return avc_has_perm(&selinux_state, sid, sid, SECCLASS_SYSTEM, SYSTEM__MODULE_LOAD, NULL); /* finit_module */ ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = file; fsec = selinux_file(file); if (sid != fsec->sid) { rc = avc_has_perm(&selinux_state, sid, fsec->sid, SECCLASS_FD, FD__USE, &ad); if (rc) return rc; } isec = inode_security(file_inode(file)); return avc_has_perm(&selinux_state, sid, isec->sid, SECCLASS_SYSTEM, SYSTEM__MODULE_LOAD, &ad); } static int selinux_kernel_read_file(struct file *file, enum kernel_read_file_id id, bool contents) { int rc = 0; switch (id) { case READING_MODULE: rc = selinux_kernel_module_from_file(contents ? file : NULL); break; default: break; } return rc; } static int selinux_kernel_load_data(enum kernel_load_data_id id, bool contents) { int rc = 0; switch (id) { case LOADING_MODULE: rc = selinux_kernel_module_from_file(NULL); default: break; } return rc; } static int selinux_task_setpgid(struct task_struct *p, pid_t pgid) { return avc_has_perm(&selinux_state, current_sid(), task_sid(p), SECCLASS_PROCESS, PROCESS__SETPGID, NULL); } static int selinux_task_getpgid(struct task_struct *p) { return avc_has_perm(&selinux_state, current_sid(), task_sid(p), SECCLASS_PROCESS, PROCESS__GETPGID, NULL); } static int selinux_task_getsid(struct task_struct *p) { return avc_has_perm(&selinux_state, current_sid(), task_sid(p), SECCLASS_PROCESS, PROCESS__GETSESSION, NULL); } static void selinux_task_getsecid(struct task_struct *p, u32 *secid) { *secid = task_sid(p); } static int selinux_task_setnice(struct task_struct *p, int nice) { return avc_has_perm(&selinux_state, current_sid(), task_sid(p), SECCLASS_PROCESS, PROCESS__SETSCHED, NULL); } static int selinux_task_setioprio(struct task_struct *p, int ioprio) { return avc_has_perm(&selinux_state, current_sid(), task_sid(p), SECCLASS_PROCESS, PROCESS__SETSCHED, NULL); } static int selinux_task_getioprio(struct task_struct *p) { return avc_has_perm(&selinux_state, current_sid(), task_sid(p), SECCLASS_PROCESS, PROCESS__GETSCHED, NULL); } static int selinux_task_prlimit(const struct cred *cred, const struct cred *tcred, unsigned int flags) { u32 av = 0; if (!flags) return 0; if (flags & LSM_PRLIMIT_WRITE) av |= PROCESS__SETRLIMIT; if (flags & LSM_PRLIMIT_READ) av |= PROCESS__GETRLIMIT; return avc_has_perm(&selinux_state, cred_sid(cred), cred_sid(tcred), SECCLASS_PROCESS, av, NULL); } static int selinux_task_setrlimit(struct task_struct *p, unsigned int resource, struct rlimit *new_rlim) { struct rlimit *old_rlim = p->signal->rlim + resource; /* Control the ability to change the hard limit (whether lowering or raising it), so that the hard limit can later be used as a safe reset point for the soft limit upon context transitions. See selinux_bprm_committing_creds. */ if (old_rlim->rlim_max != new_rlim->rlim_max) return avc_has_perm(&selinux_state, current_sid(), task_sid(p), SECCLASS_PROCESS, PROCESS__SETRLIMIT, NULL); return 0; } static int selinux_task_setscheduler(struct task_struct *p) { return avc_has_perm(&selinux_state, current_sid(), task_sid(p), SECCLASS_PROCESS, PROCESS__SETSCHED, NULL); } static int selinux_task_getscheduler(struct task_struct *p) { return avc_has_perm(&selinux_state, current_sid(), task_sid(p), SECCLASS_PROCESS, PROCESS__GETSCHED, NULL); } static int selinux_task_movememory(struct task_struct *p) { return avc_has_perm(&selinux_state, current_sid(), task_sid(p), SECCLASS_PROCESS, PROCESS__SETSCHED, NULL); } static int selinux_task_kill(struct task_struct *p, struct kernel_siginfo *info, int sig, const struct cred *cred) { u32 secid; u32 perm; if (!sig) perm = PROCESS__SIGNULL; /* null signal; existence test */ else perm = signal_to_av(sig); if (!cred) secid = current_sid(); else secid = cred_sid(cred); return avc_has_perm(&selinux_state, secid, task_sid(p), SECCLASS_PROCESS, perm, NULL); } static void selinux_task_to_inode(struct task_struct *p, struct inode *inode) { struct inode_security_struct *isec = selinux_inode(inode); u32 sid = task_sid(p); spin_lock(&isec->lock); isec->sclass = inode_mode_to_security_class(inode->i_mode); isec->sid = sid; isec->initialized = LABEL_INITIALIZED; spin_unlock(&isec->lock); } /* Returns error only if unable to parse addresses */ static int selinux_parse_skb_ipv4(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto) { int offset, ihlen, ret = -EINVAL; struct iphdr _iph, *ih; offset = skb_network_offset(skb); ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); if (ih == NULL) goto out; ihlen = ih->ihl * 4; if (ihlen < sizeof(_iph)) goto out; ad->u.net->v4info.saddr = ih->saddr; ad->u.net->v4info.daddr = ih->daddr; ret = 0; if (proto) *proto = ih->protocol; switch (ih->protocol) { case IPPROTO_TCP: { struct tcphdr _tcph, *th; if (ntohs(ih->frag_off) & IP_OFFSET) break; offset += ihlen; th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); if (th == NULL) break; ad->u.net->sport = th->source; ad->u.net->dport = th->dest; break; } case IPPROTO_UDP: { struct udphdr _udph, *uh; if (ntohs(ih->frag_off) & IP_OFFSET) break; offset += ihlen; uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); if (uh == NULL) break; ad->u.net->sport = uh->source; ad->u.net->dport = uh->dest; break; } case IPPROTO_DCCP: { struct dccp_hdr _dccph, *dh; if (ntohs(ih->frag_off) & IP_OFFSET) break; offset += ihlen; dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); if (dh == NULL) break; ad->u.net->sport = dh->dccph_sport; ad->u.net->dport = dh->dccph_dport; break; } #if IS_ENABLED(CONFIG_IP_SCTP) case IPPROTO_SCTP: { struct sctphdr _sctph, *sh; if (ntohs(ih->frag_off) & IP_OFFSET) break; offset += ihlen; sh = skb_header_pointer(skb, offset, sizeof(_sctph), &_sctph); if (sh == NULL) break; ad->u.net->sport = sh->source; ad->u.net->dport = sh->dest; break; } #endif default: break; } out: return ret; } #if IS_ENABLED(CONFIG_IPV6) /* Returns error only if unable to parse addresses */ static int selinux_parse_skb_ipv6(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto) { u8 nexthdr; int ret = -EINVAL, offset; struct ipv6hdr _ipv6h, *ip6; __be16 frag_off; offset = skb_network_offset(skb); ip6 = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (ip6 == NULL) goto out; ad->u.net->v6info.saddr = ip6->saddr; ad->u.net->v6info.daddr = ip6->daddr; ret = 0; nexthdr = ip6->nexthdr; offset += sizeof(_ipv6h); offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off); if (offset < 0) goto out; if (proto) *proto = nexthdr; switch (nexthdr) { case IPPROTO_TCP: { struct tcphdr _tcph, *th; th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); if (th == NULL) break; ad->u.net->sport = th->source; ad->u.net->dport = th->dest; break; } case IPPROTO_UDP: { struct udphdr _udph, *uh; uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); if (uh == NULL) break; ad->u.net->sport = uh->source; ad->u.net->dport = uh->dest; break; } case IPPROTO_DCCP: { struct dccp_hdr _dccph, *dh; dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); if (dh == NULL) break; ad->u.net->sport = dh->dccph_sport; ad->u.net->dport = dh->dccph_dport; break; } #if IS_ENABLED(CONFIG_IP_SCTP) case IPPROTO_SCTP: { struct sctphdr _sctph, *sh; sh = skb_header_pointer(skb, offset, sizeof(_sctph), &_sctph); if (sh == NULL) break; ad->u.net->sport = sh->source; ad->u.net->dport = sh->dest; break; } #endif /* includes fragments */ default: break; } out: return ret; } #endif /* IPV6 */ static int selinux_parse_skb(struct sk_buff *skb, struct common_audit_data *ad, char **_addrp, int src, u8 *proto) { char *addrp; int ret; switch (ad->u.net->family) { case PF_INET: ret = selinux_parse_skb_ipv4(skb, ad, proto); if (ret) goto parse_error; addrp = (char *)(src ? &ad->u.net->v4info.saddr : &ad->u.net->v4info.daddr); goto okay; #if IS_ENABLED(CONFIG_IPV6) case PF_INET6: ret = selinux_parse_skb_ipv6(skb, ad, proto); if (ret) goto parse_error; addrp = (char *)(src ? &ad->u.net->v6info.saddr : &ad->u.net->v6info.daddr); goto okay; #endif /* IPV6 */ default: addrp = NULL; goto okay; } parse_error: pr_warn( "SELinux: failure in selinux_parse_skb()," " unable to parse packet\n"); return ret; okay: if (_addrp) *_addrp = addrp; return 0; } /** * selinux_skb_peerlbl_sid - Determine the peer label of a packet * @skb: the packet * @family: protocol family * @sid: the packet's peer label SID * * Description: * Check the various different forms of network peer labeling and determine * the peer label/SID for the packet; most of the magic actually occurs in * the security server function security_net_peersid_cmp(). The function * returns zero if the value in @sid is valid (although it may be SECSID_NULL) * or -EACCES if @sid is invalid due to inconsistencies with the different * peer labels. * */ static int selinux_skb_peerlbl_sid(struct sk_buff *skb, u16 family, u32 *sid) { int err; u32 xfrm_sid; u32 nlbl_sid; u32 nlbl_type; err = selinux_xfrm_skb_sid(skb, &xfrm_sid); if (unlikely(err)) return -EACCES; err = selinux_netlbl_skbuff_getsid(skb, family, &nlbl_type, &nlbl_sid); if (unlikely(err)) return -EACCES; err = security_net_peersid_resolve(&selinux_state, nlbl_sid, nlbl_type, xfrm_sid, sid); if (unlikely(err)) { pr_warn( "SELinux: failure in selinux_skb_peerlbl_sid()," " unable to determine packet's peer label\n"); return -EACCES; } return 0; } /** * selinux_conn_sid - Determine the child socket label for a connection * @sk_sid: the parent socket's SID * @skb_sid: the packet's SID * @conn_sid: the resulting connection SID * * If @skb_sid is valid then the user:role:type information from @sk_sid is * combined with the MLS information from @skb_sid in order to create * @conn_sid. If @skb_sid is not valid then @conn_sid is simply a copy * of @sk_sid. Returns zero on success, negative values on failure. * */ static int selinux_conn_sid(u32 sk_sid, u32 skb_sid, u32 *conn_sid) { int err = 0; if (skb_sid != SECSID_NULL) err = security_sid_mls_copy(&selinux_state, sk_sid, skb_sid, conn_sid); else *conn_sid = sk_sid; return err; } /* socket security operations */ static int socket_sockcreate_sid(const struct task_security_struct *tsec, u16 secclass, u32 *socksid) { if (tsec->sockcreate_sid > SECSID_NULL) { *socksid = tsec->sockcreate_sid; return 0; } return security_transition_sid(&selinux_state, tsec->sid, tsec->sid, secclass, NULL, socksid); } static int sock_has_perm(struct sock *sk, u32 perms) { struct sk_security_struct *sksec = sk->sk_security; struct common_audit_data ad; struct lsm_network_audit net = {0,}; if (sksec->sid == SECINITSID_KERNEL) return 0; ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->sk = sk; return avc_has_perm(&selinux_state, current_sid(), sksec->sid, sksec->sclass, perms, &ad); } static int selinux_socket_create(int family, int type, int protocol, int kern) { const struct task_security_struct *tsec = selinux_cred(current_cred()); u32 newsid; u16 secclass; int rc; if (kern) return 0; secclass = socket_type_to_security_class(family, type, protocol); rc = socket_sockcreate_sid(tsec, secclass, &newsid); if (rc) return rc; return avc_has_perm(&selinux_state, tsec->sid, newsid, secclass, SOCKET__CREATE, NULL); } static int selinux_socket_post_create(struct socket *sock, int family, int type, int protocol, int kern) { const struct task_security_struct *tsec = selinux_cred(current_cred()); struct inode_security_struct *isec = inode_security_novalidate(SOCK_INODE(sock)); struct sk_security_struct *sksec; u16 sclass = socket_type_to_security_class(family, type, protocol); u32 sid = SECINITSID_KERNEL; int err = 0; if (!kern) { err = socket_sockcreate_sid(tsec, sclass, &sid); if (err) return err; } isec->sclass = sclass; isec->sid = sid; isec->initialized = LABEL_INITIALIZED; if (sock->sk) { sksec = sock->sk->sk_security; sksec->sclass = sclass; sksec->sid = sid; /* Allows detection of the first association on this socket */ if (sksec->sclass == SECCLASS_SCTP_SOCKET) sksec->sctp_assoc_state = SCTP_ASSOC_UNSET; err = selinux_netlbl_socket_post_create(sock->sk, family); } return err; } static int selinux_socket_socketpair(struct socket *socka, struct socket *sockb) { struct sk_security_struct *sksec_a = socka->sk->sk_security; struct sk_security_struct *sksec_b = sockb->sk->sk_security; sksec_a->peer_sid = sksec_b->sid; sksec_b->peer_sid = sksec_a->sid; return 0; } /* Range of port numbers used to automatically bind. Need to determine whether we should perform a name_bind permission check between the socket and the port number. */ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen) { struct sock *sk = sock->sk; struct sk_security_struct *sksec = sk->sk_security; u16 family; int err; err = sock_has_perm(sk, SOCKET__BIND); if (err) goto out; /* If PF_INET or PF_INET6, check name_bind permission for the port. */ family = sk->sk_family; if (family == PF_INET || family == PF_INET6) { char *addrp; struct common_audit_data ad; struct lsm_network_audit net = {0,}; struct sockaddr_in *addr4 = NULL; struct sockaddr_in6 *addr6 = NULL; u16 family_sa; unsigned short snum; u32 sid, node_perm; /* * sctp_bindx(3) calls via selinux_sctp_bind_connect() * that validates multiple binding addresses. Because of this * need to check address->sa_family as it is possible to have * sk->sk_family = PF_INET6 with addr->sa_family = AF_INET. */ if (addrlen < offsetofend(struct sockaddr, sa_family)) return -EINVAL; family_sa = address->sa_family; switch (family_sa) { case AF_UNSPEC: case AF_INET: if (addrlen < sizeof(struct sockaddr_in)) return -EINVAL; addr4 = (struct sockaddr_in *)address; if (family_sa == AF_UNSPEC) { if (family == PF_INET6) { /* Length check from inet6_bind_sk() */ if (addrlen < SIN6_LEN_RFC2133) return -EINVAL; /* Family check from __inet6_bind() */ goto err_af; } /* see __inet_bind(), we only want to allow * AF_UNSPEC if the address is INADDR_ANY */ if (addr4->sin_addr.s_addr != htonl(INADDR_ANY)) goto err_af; family_sa = AF_INET; } snum = ntohs(addr4->sin_port); addrp = (char *)&addr4->sin_addr.s_addr; break; case AF_INET6: if (addrlen < SIN6_LEN_RFC2133) return -EINVAL; addr6 = (struct sockaddr_in6 *)address; snum = ntohs(addr6->sin6_port); addrp = (char *)&addr6->sin6_addr.s6_addr; break; default: goto err_af; } ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->sport = htons(snum); ad.u.net->family = family_sa; if (snum) { int low, high; inet_get_local_port_range(sock_net(sk), &low, &high); if (inet_port_requires_bind_service(sock_net(sk), snum) || snum < low || snum > high) { err = sel_netport_sid(sk->sk_protocol, snum, &sid); if (err) goto out; err = avc_has_perm(&selinux_state, sksec->sid, sid, sksec->sclass, SOCKET__NAME_BIND, &ad); if (err) goto out; } } switch (sksec->sclass) { case SECCLASS_TCP_SOCKET: node_perm = TCP_SOCKET__NODE_BIND; break; case SECCLASS_UDP_SOCKET: node_perm = UDP_SOCKET__NODE_BIND; break; case SECCLASS_DCCP_SOCKET: node_perm = DCCP_SOCKET__NODE_BIND; break; case SECCLASS_SCTP_SOCKET: node_perm = SCTP_SOCKET__NODE_BIND; break; default: node_perm = RAWIP_SOCKET__NODE_BIND; break; } err = sel_netnode_sid(addrp, family_sa, &sid); if (err) goto out; if (family_sa == AF_INET) ad.u.net->v4info.saddr = addr4->sin_addr.s_addr; else ad.u.net->v6info.saddr = addr6->sin6_addr; err = avc_has_perm(&selinux_state, sksec->sid, sid, sksec->sclass, node_perm, &ad); if (err) goto out; } out: return err; err_af: /* Note that SCTP services expect -EINVAL, others -EAFNOSUPPORT. */ if (sksec->sclass == SECCLASS_SCTP_SOCKET) return -EINVAL; return -EAFNOSUPPORT; } /* This supports connect(2) and SCTP connect services such as sctp_connectx(3) * and sctp_sendmsg(3) as described in Documentation/security/SCTP.rst */ static int selinux_socket_connect_helper(struct socket *sock, struct sockaddr *address, int addrlen) { struct sock *sk = sock->sk; struct sk_security_struct *sksec = sk->sk_security; int err; err = sock_has_perm(sk, SOCKET__CONNECT); if (err) return err; if (addrlen < offsetofend(struct sockaddr, sa_family)) return -EINVAL; /* connect(AF_UNSPEC) has special handling, as it is a documented * way to disconnect the socket */ if (address->sa_family == AF_UNSPEC) return 0; /* * If a TCP, DCCP or SCTP socket, check name_connect permission * for the port. */ if (sksec->sclass == SECCLASS_TCP_SOCKET || sksec->sclass == SECCLASS_DCCP_SOCKET || sksec->sclass == SECCLASS_SCTP_SOCKET) { struct common_audit_data ad; struct lsm_network_audit net = {0,}; struct sockaddr_in *addr4 = NULL; struct sockaddr_in6 *addr6 = NULL; unsigned short snum; u32 sid, perm; /* sctp_connectx(3) calls via selinux_sctp_bind_connect() * that validates multiple connect addresses. Because of this * need to check address->sa_family as it is possible to have * sk->sk_family = PF_INET6 with addr->sa_family = AF_INET. */ switch (address->sa_family) { case AF_INET: addr4 = (struct sockaddr_in *)address; if (addrlen < sizeof(struct sockaddr_in)) return -EINVAL; snum = ntohs(addr4->sin_port); break; case AF_INET6: addr6 = (struct sockaddr_in6 *)address; if (addrlen < SIN6_LEN_RFC2133) return -EINVAL; snum = ntohs(addr6->sin6_port); break; default: /* Note that SCTP services expect -EINVAL, whereas * others expect -EAFNOSUPPORT. */ if (sksec->sclass == SECCLASS_SCTP_SOCKET) return -EINVAL; else return -EAFNOSUPPORT; } err = sel_netport_sid(sk->sk_protocol, snum, &sid); if (err) return err; switch (sksec->sclass) { case SECCLASS_TCP_SOCKET: perm = TCP_SOCKET__NAME_CONNECT; break; case SECCLASS_DCCP_SOCKET: perm = DCCP_SOCKET__NAME_CONNECT; break; case SECCLASS_SCTP_SOCKET: perm = SCTP_SOCKET__NAME_CONNECT; break; } ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->dport = htons(snum); ad.u.net->family = address->sa_family; err = avc_has_perm(&selinux_state, sksec->sid, sid, sksec->sclass, perm, &ad); if (err) return err; } return 0; } /* Supports connect(2), see comments in selinux_socket_connect_helper() */ static int selinux_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen) { int err; struct sock *sk = sock->sk; err = selinux_socket_connect_helper(sock, address, addrlen); if (err) return err; return selinux_netlbl_socket_connect(sk, address); } static int selinux_socket_listen(struct socket *sock, int backlog) { return sock_has_perm(sock->sk, SOCKET__LISTEN); } static int selinux_socket_accept(struct socket *sock, struct socket *newsock) { int err; struct inode_security_struct *isec; struct inode_security_struct *newisec; u16 sclass; u32 sid; err = sock_has_perm(sock->sk, SOCKET__ACCEPT); if (err) return err; isec = inode_security_novalidate(SOCK_INODE(sock)); spin_lock(&isec->lock); sclass = isec->sclass; sid = isec->sid; spin_unlock(&isec->lock); newisec = inode_security_novalidate(SOCK_INODE(newsock)); newisec->sclass = sclass; newisec->sid = sid; newisec->initialized = LABEL_INITIALIZED; return 0; } static int selinux_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { return sock_has_perm(sock->sk, SOCKET__WRITE); } static int selinux_socket_recvmsg(struct socket *sock, struct msghdr *msg, int size, int flags) { return sock_has_perm(sock->sk, SOCKET__READ); } static int selinux_socket_getsockname(struct socket *sock) { return sock_has_perm(sock->sk, SOCKET__GETATTR); } static int selinux_socket_getpeername(struct socket *sock) { return sock_has_perm(sock->sk, SOCKET__GETATTR); } static int selinux_socket_setsockopt(struct socket *sock, int level, int optname) { int err; err = sock_has_perm(sock->sk, SOCKET__SETOPT); if (err) return err; return selinux_netlbl_socket_setsockopt(sock, level, optname); } static int selinux_socket_getsockopt(struct socket *sock, int level, int optname) { return sock_has_perm(sock->sk, SOCKET__GETOPT); } static int selinux_socket_shutdown(struct socket *sock, int how) { return sock_has_perm(sock->sk, SOCKET__SHUTDOWN); } static int selinux_socket_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk) { struct sk_security_struct *sksec_sock = sock->sk_security; struct sk_security_struct *sksec_other = other->sk_security; struct sk_security_struct *sksec_new = newsk->sk_security; struct common_audit_data ad; struct lsm_network_audit net = {0,}; int err; ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->sk = other; err = avc_has_perm(&selinux_state, sksec_sock->sid, sksec_other->sid, sksec_other->sclass, UNIX_STREAM_SOCKET__CONNECTTO, &ad); if (err) return err; /* server child socket */ sksec_new->peer_sid = sksec_sock->sid; err = security_sid_mls_copy(&selinux_state, sksec_other->sid, sksec_sock->sid, &sksec_new->sid); if (err) return err; /* connecting socket */ sksec_sock->peer_sid = sksec_new->sid; return 0; } static int selinux_socket_unix_may_send(struct socket *sock, struct socket *other) { struct sk_security_struct *ssec = sock->sk->sk_security; struct sk_security_struct *osec = other->sk->sk_security; struct common_audit_data ad; struct lsm_network_audit net = {0,}; ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->sk = other->sk; return avc_has_perm(&selinux_state, ssec->sid, osec->sid, osec->sclass, SOCKET__SENDTO, &ad); } static int selinux_inet_sys_rcv_skb(struct net *ns, int ifindex, char *addrp, u16 family, u32 peer_sid, struct common_audit_data *ad) { int err; u32 if_sid; u32 node_sid; err = sel_netif_sid(ns, ifindex, &if_sid); if (err) return err; err = avc_has_perm(&selinux_state, peer_sid, if_sid, SECCLASS_NETIF, NETIF__INGRESS, ad); if (err) return err; err = sel_netnode_sid(addrp, family, &node_sid); if (err) return err; return avc_has_perm(&selinux_state, peer_sid, node_sid, SECCLASS_NODE, NODE__RECVFROM, ad); } static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb, u16 family) { int err = 0; struct sk_security_struct *sksec = sk->sk_security; u32 sk_sid = sksec->sid; struct common_audit_data ad; struct lsm_network_audit net = {0,}; char *addrp; ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->netif = skb->skb_iif; ad.u.net->family = family; err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL); if (err) return err; if (selinux_secmark_enabled()) { err = avc_has_perm(&selinux_state, sk_sid, skb->secmark, SECCLASS_PACKET, PACKET__RECV, &ad); if (err) return err; } err = selinux_netlbl_sock_rcv_skb(sksec, skb, family, &ad); if (err) return err; err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, &ad); return err; } static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err; struct sk_security_struct *sksec = sk->sk_security; u16 family = sk->sk_family; u32 sk_sid = sksec->sid; struct common_audit_data ad; struct lsm_network_audit net = {0,}; char *addrp; u8 secmark_active; u8 peerlbl_active; if (family != PF_INET && family != PF_INET6) return 0; /* Handle mapped IPv4 packets arriving via IPv6 sockets */ if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP)) family = PF_INET; /* If any sort of compatibility mode is enabled then handoff processing * to the selinux_sock_rcv_skb_compat() function to deal with the * special handling. We do this in an attempt to keep this function * as fast and as clean as possible. */ if (!selinux_policycap_netpeer()) return selinux_sock_rcv_skb_compat(sk, skb, family); secmark_active = selinux_secmark_enabled(); peerlbl_active = selinux_peerlbl_enabled(); if (!secmark_active && !peerlbl_active) return 0; ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->netif = skb->skb_iif; ad.u.net->family = family; err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL); if (err) return err; if (peerlbl_active) { u32 peer_sid; err = selinux_skb_peerlbl_sid(skb, family, &peer_sid); if (err) return err; err = selinux_inet_sys_rcv_skb(sock_net(sk), skb->skb_iif, addrp, family, peer_sid, &ad); if (err) { selinux_netlbl_err(skb, family, err, 0); return err; } err = avc_has_perm(&selinux_state, sk_sid, peer_sid, SECCLASS_PEER, PEER__RECV, &ad); if (err) { selinux_netlbl_err(skb, family, err, 0); return err; } } if (secmark_active) { err = avc_has_perm(&selinux_state, sk_sid, skb->secmark, SECCLASS_PACKET, PACKET__RECV, &ad); if (err) return err; } return err; } static int selinux_socket_getpeersec_stream(struct socket *sock, sockptr_t optval, sockptr_t optlen, unsigned int len) { int err = 0; char *scontext = NULL; u32 scontext_len; struct sk_security_struct *sksec = sock->sk->sk_security; u32 peer_sid = SECSID_NULL; if (sksec->sclass == SECCLASS_UNIX_STREAM_SOCKET || sksec->sclass == SECCLASS_TCP_SOCKET || sksec->sclass == SECCLASS_SCTP_SOCKET) peer_sid = sksec->peer_sid; if (peer_sid == SECSID_NULL) return -ENOPROTOOPT; err = security_sid_to_context(&selinux_state, peer_sid, &scontext, &scontext_len); if (err) return err; if (scontext_len > len) { err = -ERANGE; goto out_len; } if (copy_to_sockptr(optval, scontext, scontext_len)) err = -EFAULT; out_len: if (copy_to_sockptr(optlen, &scontext_len, sizeof(scontext_len))) err = -EFAULT; kfree(scontext); return err; } static int selinux_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { u32 peer_secid = SECSID_NULL; u16 family; struct inode_security_struct *isec; if (skb && skb->protocol == htons(ETH_P_IP)) family = PF_INET; else if (skb && skb->protocol == htons(ETH_P_IPV6)) family = PF_INET6; else if (sock) family = sock->sk->sk_family; else goto out; if (sock && family == PF_UNIX) { isec = inode_security_novalidate(SOCK_INODE(sock)); peer_secid = isec->sid; } else if (skb) selinux_skb_peerlbl_sid(skb, family, &peer_secid); out: *secid = peer_secid; if (peer_secid == SECSID_NULL) return -EINVAL; return 0; } static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority) { struct sk_security_struct *sksec; sksec = kzalloc(sizeof(*sksec), priority); if (!sksec) return -ENOMEM; sksec->peer_sid = SECINITSID_UNLABELED; sksec->sid = SECINITSID_UNLABELED; sksec->sclass = SECCLASS_SOCKET; selinux_netlbl_sk_security_reset(sksec); sk->sk_security = sksec; return 0; } static void selinux_sk_free_security(struct sock *sk) { struct sk_security_struct *sksec = sk->sk_security; sk->sk_security = NULL; selinux_netlbl_sk_security_free(sksec); kfree(sksec); } static void selinux_sk_clone_security(const struct sock *sk, struct sock *newsk) { struct sk_security_struct *sksec = sk->sk_security; struct sk_security_struct *newsksec = newsk->sk_security; newsksec->sid = sksec->sid; newsksec->peer_sid = sksec->peer_sid; newsksec->sclass = sksec->sclass; selinux_netlbl_sk_security_reset(newsksec); } static void selinux_sk_getsecid(struct sock *sk, u32 *secid) { if (!sk) *secid = SECINITSID_ANY_SOCKET; else { struct sk_security_struct *sksec = sk->sk_security; *secid = sksec->sid; } } static void selinux_sock_graft(struct sock *sk, struct socket *parent) { struct inode_security_struct *isec = inode_security_novalidate(SOCK_INODE(parent)); struct sk_security_struct *sksec = sk->sk_security; if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6 || sk->sk_family == PF_UNIX) isec->sid = sksec->sid; sksec->sclass = isec->sclass; } /* Called whenever SCTP receives an INIT chunk. This happens when an incoming * connect(2), sctp_connectx(3) or sctp_sendmsg(3) (with no association * already present). */ static int selinux_sctp_assoc_request(struct sctp_endpoint *ep, struct sk_buff *skb) { struct sk_security_struct *sksec = ep->base.sk->sk_security; struct common_audit_data ad; struct lsm_network_audit net = {0,}; u8 peerlbl_active; u32 peer_sid = SECINITSID_UNLABELED; u32 conn_sid; int err = 0; if (!selinux_policycap_extsockclass()) return 0; peerlbl_active = selinux_peerlbl_enabled(); if (peerlbl_active) { /* This will return peer_sid = SECSID_NULL if there are * no peer labels, see security_net_peersid_resolve(). */ err = selinux_skb_peerlbl_sid(skb, ep->base.sk->sk_family, &peer_sid); if (err) return err; if (peer_sid == SECSID_NULL) peer_sid = SECINITSID_UNLABELED; } if (sksec->sctp_assoc_state == SCTP_ASSOC_UNSET) { sksec->sctp_assoc_state = SCTP_ASSOC_SET; /* Here as first association on socket. As the peer SID * was allowed by peer recv (and the netif/node checks), * then it is approved by policy and used as the primary * peer SID for getpeercon(3). */ sksec->peer_sid = peer_sid; } else if (sksec->peer_sid != peer_sid) { /* Other association peer SIDs are checked to enforce * consistency among the peer SIDs. */ ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->sk = ep->base.sk; err = avc_has_perm(&selinux_state, sksec->peer_sid, peer_sid, sksec->sclass, SCTP_SOCKET__ASSOCIATION, &ad); if (err) return err; } /* Compute the MLS component for the connection and store * the information in ep. This will be used by SCTP TCP type * sockets and peeled off connections as they cause a new * socket to be generated. selinux_sctp_sk_clone() will then * plug this into the new socket. */ err = selinux_conn_sid(sksec->sid, peer_sid, &conn_sid); if (err) return err; ep->secid = conn_sid; ep->peer_secid = peer_sid; /* Set any NetLabel labels including CIPSO/CALIPSO options. */ return selinux_netlbl_sctp_assoc_request(ep, skb); } /* Check if sctp IPv4/IPv6 addresses are valid for binding or connecting * based on their @optname. */ static int selinux_sctp_bind_connect(struct sock *sk, int optname, struct sockaddr *address, int addrlen) { int len, err = 0, walk_size = 0; void *addr_buf; struct sockaddr *addr; struct socket *sock; if (!selinux_policycap_extsockclass()) return 0; /* Process one or more addresses that may be IPv4 or IPv6 */ sock = sk->sk_socket; addr_buf = address; while (walk_size < addrlen) { if (walk_size + sizeof(sa_family_t) > addrlen) return -EINVAL; addr = addr_buf; switch (addr->sa_family) { case AF_UNSPEC: case AF_INET: len = sizeof(struct sockaddr_in); break; case AF_INET6: len = sizeof(struct sockaddr_in6); break; default: return -EINVAL; } if (walk_size + len > addrlen) return -EINVAL; err = -EINVAL; switch (optname) { /* Bind checks */ case SCTP_PRIMARY_ADDR: case SCTP_SET_PEER_PRIMARY_ADDR: case SCTP_SOCKOPT_BINDX_ADD: err = selinux_socket_bind(sock, addr, len); break; /* Connect checks */ case SCTP_SOCKOPT_CONNECTX: case SCTP_PARAM_SET_PRIMARY: case SCTP_PARAM_ADD_IP: case SCTP_SENDMSG_CONNECT: err = selinux_socket_connect_helper(sock, addr, len); if (err) return err; /* As selinux_sctp_bind_connect() is called by the * SCTP protocol layer, the socket is already locked, * therefore selinux_netlbl_socket_connect_locked() * is called here. The situations handled are: * sctp_connectx(3), sctp_sendmsg(3), sendmsg(2), * whenever a new IP address is added or when a new * primary address is selected. * Note that an SCTP connect(2) call happens before * the SCTP protocol layer and is handled via * selinux_socket_connect(). */ err = selinux_netlbl_socket_connect_locked(sk, addr); break; } if (err) return err; addr_buf += len; walk_size += len; } return 0; } /* Called whenever a new socket is created by accept(2) or sctp_peeloff(3). */ static void selinux_sctp_sk_clone(struct sctp_endpoint *ep, struct sock *sk, struct sock *newsk) { struct sk_security_struct *sksec = sk->sk_security; struct sk_security_struct *newsksec = newsk->sk_security; /* If policy does not support SECCLASS_SCTP_SOCKET then call * the non-sctp clone version. */ if (!selinux_policycap_extsockclass()) return selinux_sk_clone_security(sk, newsk); newsksec->sid = ep->secid; newsksec->peer_sid = ep->peer_secid; newsksec->sclass = sksec->sclass; selinux_netlbl_sctp_sk_clone(sk, newsk); } static int selinux_inet_conn_request(struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct sk_security_struct *sksec = sk->sk_security; int err; u16 family = req->rsk_ops->family; u32 connsid; u32 peersid; err = selinux_skb_peerlbl_sid(skb, family, &peersid); if (err) return err; err = selinux_conn_sid(sksec->sid, peersid, &connsid); if (err) return err; req->secid = connsid; req->peer_secid = peersid; return selinux_netlbl_inet_conn_request(req, family); } static void selinux_inet_csk_clone(struct sock *newsk, const struct request_sock *req) { struct sk_security_struct *newsksec = newsk->sk_security; newsksec->sid = req->secid; newsksec->peer_sid = req->peer_secid; /* NOTE: Ideally, we should also get the isec->sid for the new socket in sync, but we don't have the isec available yet. So we will wait until sock_graft to do it, by which time it will have been created and available. */ /* We don't need to take any sort of lock here as we are the only * thread with access to newsksec */ selinux_netlbl_inet_csk_clone(newsk, req->rsk_ops->family); } static void selinux_inet_conn_established(struct sock *sk, struct sk_buff *skb) { u16 family = sk->sk_family; struct sk_security_struct *sksec = sk->sk_security; /* handle mapped IPv4 packets arriving via IPv6 sockets */ if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP)) family = PF_INET; selinux_skb_peerlbl_sid(skb, family, &sksec->peer_sid); } static int selinux_secmark_relabel_packet(u32 sid) { const struct task_security_struct *__tsec; u32 tsid; __tsec = selinux_cred(current_cred()); tsid = __tsec->sid; return avc_has_perm(&selinux_state, tsid, sid, SECCLASS_PACKET, PACKET__RELABELTO, NULL); } static void selinux_secmark_refcount_inc(void) { atomic_inc(&selinux_secmark_refcount); } static void selinux_secmark_refcount_dec(void) { atomic_dec(&selinux_secmark_refcount); } static void selinux_req_classify_flow(const struct request_sock *req, struct flowi_common *flic) { flic->flowic_secid = req->secid; } static int selinux_tun_dev_alloc_security(void **security) { struct tun_security_struct *tunsec; tunsec = kzalloc(sizeof(*tunsec), GFP_KERNEL); if (!tunsec) return -ENOMEM; tunsec->sid = current_sid(); *security = tunsec; return 0; } static void selinux_tun_dev_free_security(void *security) { kfree(security); } static int selinux_tun_dev_create(void) { u32 sid = current_sid(); /* we aren't taking into account the "sockcreate" SID since the socket * that is being created here is not a socket in the traditional sense, * instead it is a private sock, accessible only to the kernel, and * representing a wide range of network traffic spanning multiple * connections unlike traditional sockets - check the TUN driver to * get a better understanding of why this socket is special */ return avc_has_perm(&selinux_state, sid, sid, SECCLASS_TUN_SOCKET, TUN_SOCKET__CREATE, NULL); } static int selinux_tun_dev_attach_queue(void *security) { struct tun_security_struct *tunsec = security; return avc_has_perm(&selinux_state, current_sid(), tunsec->sid, SECCLASS_TUN_SOCKET, TUN_SOCKET__ATTACH_QUEUE, NULL); } static int selinux_tun_dev_attach(struct sock *sk, void *security) { struct tun_security_struct *tunsec = security; struct sk_security_struct *sksec = sk->sk_security; /* we don't currently perform any NetLabel based labeling here and it * isn't clear that we would want to do so anyway; while we could apply * labeling without the support of the TUN user the resulting labeled * traffic from the other end of the connection would almost certainly * cause confusion to the TUN user that had no idea network labeling * protocols were being used */ sksec->sid = tunsec->sid; sksec->sclass = SECCLASS_TUN_SOCKET; return 0; } static int selinux_tun_dev_open(void *security) { struct tun_security_struct *tunsec = security; u32 sid = current_sid(); int err; err = avc_has_perm(&selinux_state, sid, tunsec->sid, SECCLASS_TUN_SOCKET, TUN_SOCKET__RELABELFROM, NULL); if (err) return err; err = avc_has_perm(&selinux_state, sid, sid, SECCLASS_TUN_SOCKET, TUN_SOCKET__RELABELTO, NULL); if (err) return err; tunsec->sid = sid; return 0; } #ifdef CONFIG_NETFILTER static unsigned int selinux_ip_forward(struct sk_buff *skb, const struct net_device *indev, u16 family) { int err; char *addrp; u32 peer_sid; struct common_audit_data ad; struct lsm_network_audit net = {0,}; u8 secmark_active; u8 netlbl_active; u8 peerlbl_active; if (!selinux_policycap_netpeer()) return NF_ACCEPT; secmark_active = selinux_secmark_enabled(); netlbl_active = netlbl_enabled(); peerlbl_active = selinux_peerlbl_enabled(); if (!secmark_active && !peerlbl_active) return NF_ACCEPT; if (selinux_skb_peerlbl_sid(skb, family, &peer_sid) != 0) return NF_DROP; ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->netif = indev->ifindex; ad.u.net->family = family; if (selinux_parse_skb(skb, &ad, &addrp, 1, NULL) != 0) return NF_DROP; if (peerlbl_active) { err = selinux_inet_sys_rcv_skb(dev_net(indev), indev->ifindex, addrp, family, peer_sid, &ad); if (err) { selinux_netlbl_err(skb, family, err, 1); return NF_DROP; } } if (secmark_active) if (avc_has_perm(&selinux_state, peer_sid, skb->secmark, SECCLASS_PACKET, PACKET__FORWARD_IN, &ad)) return NF_DROP; if (netlbl_active) /* we do this in the FORWARD path and not the POST_ROUTING * path because we want to make sure we apply the necessary * labeling before IPsec is applied so we can leverage AH * protection */ if (selinux_netlbl_skbuff_setsid(skb, family, peer_sid) != 0) return NF_DROP; return NF_ACCEPT; } static unsigned int selinux_ipv4_forward(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { return selinux_ip_forward(skb, state->in, PF_INET); } #if IS_ENABLED(CONFIG_IPV6) static unsigned int selinux_ipv6_forward(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { return selinux_ip_forward(skb, state->in, PF_INET6); } #endif /* IPV6 */ static unsigned int selinux_ip_output(struct sk_buff *skb, u16 family) { struct sock *sk; u32 sid; if (!netlbl_enabled()) return NF_ACCEPT; /* we do this in the LOCAL_OUT path and not the POST_ROUTING path * because we want to make sure we apply the necessary labeling * before IPsec is applied so we can leverage AH protection */ sk = skb->sk; if (sk) { struct sk_security_struct *sksec; if (sk_listener(sk)) /* if the socket is the listening state then this * packet is a SYN-ACK packet which means it needs to * be labeled based on the connection/request_sock and * not the parent socket. unfortunately, we can't * lookup the request_sock yet as it isn't queued on * the parent socket until after the SYN-ACK is sent. * the "solution" is to simply pass the packet as-is * as any IP option based labeling should be copied * from the initial connection request (in the IP * layer). it is far from ideal, but until we get a * security label in the packet itself this is the * best we can do. */ return NF_ACCEPT; /* standard practice, label using the parent socket */ sksec = sk->sk_security; sid = sksec->sid; } else sid = SECINITSID_KERNEL; if (selinux_netlbl_skbuff_setsid(skb, family, sid) != 0) return NF_DROP; return NF_ACCEPT; } static unsigned int selinux_ipv4_output(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { return selinux_ip_output(skb, PF_INET); } #if IS_ENABLED(CONFIG_IPV6) static unsigned int selinux_ipv6_output(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { return selinux_ip_output(skb, PF_INET6); } #endif /* IPV6 */ static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb, int ifindex, u16 family) { struct sock *sk = skb_to_full_sk(skb); struct sk_security_struct *sksec; struct common_audit_data ad; struct lsm_network_audit net = {0,}; char *addrp; u8 proto = 0; if (sk == NULL) return NF_ACCEPT; sksec = sk->sk_security; ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->netif = ifindex; ad.u.net->family = family; if (selinux_parse_skb(skb, &ad, &addrp, 0, &proto)) return NF_DROP; if (selinux_secmark_enabled()) if (avc_has_perm(&selinux_state, sksec->sid, skb->secmark, SECCLASS_PACKET, PACKET__SEND, &ad)) return NF_DROP_ERR(-ECONNREFUSED); if (selinux_xfrm_postroute_last(sksec->sid, skb, &ad, proto)) return NF_DROP_ERR(-ECONNREFUSED); return NF_ACCEPT; } static unsigned int selinux_ip_postroute(struct sk_buff *skb, const struct net_device *outdev, u16 family) { u32 secmark_perm; u32 peer_sid; int ifindex = outdev->ifindex; struct sock *sk; struct common_audit_data ad; struct lsm_network_audit net = {0,}; char *addrp; u8 secmark_active; u8 peerlbl_active; /* If any sort of compatibility mode is enabled then handoff processing * to the selinux_ip_postroute_compat() function to deal with the * special handling. We do this in an attempt to keep this function * as fast and as clean as possible. */ if (!selinux_policycap_netpeer()) return selinux_ip_postroute_compat(skb, ifindex, family); secmark_active = selinux_secmark_enabled(); peerlbl_active = selinux_peerlbl_enabled(); if (!secmark_active && !peerlbl_active) return NF_ACCEPT; sk = skb_to_full_sk(skb); #ifdef CONFIG_XFRM /* If skb->dst->xfrm is non-NULL then the packet is undergoing an IPsec * packet transformation so allow the packet to pass without any checks * since we'll have another chance to perform access control checks * when the packet is on it's final way out. * NOTE: there appear to be some IPv6 multicast cases where skb->dst * is NULL, in this case go ahead and apply access control. * NOTE: if this is a local socket (skb->sk != NULL) that is in the * TCP listening state we cannot wait until the XFRM processing * is done as we will miss out on the SA label if we do; * unfortunately, this means more work, but it is only once per * connection. */ if (skb_dst(skb) != NULL && skb_dst(skb)->xfrm != NULL && !(sk && sk_listener(sk))) return NF_ACCEPT; #endif if (sk == NULL) { /* Without an associated socket the packet is either coming * from the kernel or it is being forwarded; check the packet * to determine which and if the packet is being forwarded * query the packet directly to determine the security label. */ if (skb->skb_iif) { secmark_perm = PACKET__FORWARD_OUT; if (selinux_skb_peerlbl_sid(skb, family, &peer_sid)) return NF_DROP; } else { secmark_perm = PACKET__SEND; peer_sid = SECINITSID_KERNEL; } } else if (sk_listener(sk)) { /* Locally generated packet but the associated socket is in the * listening state which means this is a SYN-ACK packet. In * this particular case the correct security label is assigned * to the connection/request_sock but unfortunately we can't * query the request_sock as it isn't queued on the parent * socket until after the SYN-ACK packet is sent; the only * viable choice is to regenerate the label like we do in * selinux_inet_conn_request(). See also selinux_ip_output() * for similar problems. */ u32 skb_sid; struct sk_security_struct *sksec; sksec = sk->sk_security; if (selinux_skb_peerlbl_sid(skb, family, &skb_sid)) return NF_DROP; /* At this point, if the returned skb peerlbl is SECSID_NULL * and the packet has been through at least one XFRM * transformation then we must be dealing with the "final" * form of labeled IPsec packet; since we've already applied * all of our access controls on this packet we can safely * pass the packet. */ if (skb_sid == SECSID_NULL) { switch (family) { case PF_INET: if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) return NF_ACCEPT; break; case PF_INET6: if (IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) return NF_ACCEPT; break; default: return NF_DROP_ERR(-ECONNREFUSED); } } if (selinux_conn_sid(sksec->sid, skb_sid, &peer_sid)) return NF_DROP; secmark_perm = PACKET__SEND; } else { /* Locally generated packet, fetch the security label from the * associated socket. */ struct sk_security_struct *sksec = sk->sk_security; peer_sid = sksec->sid; secmark_perm = PACKET__SEND; } ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->netif = ifindex; ad.u.net->family = family; if (selinux_parse_skb(skb, &ad, &addrp, 0, NULL)) return NF_DROP; if (secmark_active) if (avc_has_perm(&selinux_state, peer_sid, skb->secmark, SECCLASS_PACKET, secmark_perm, &ad)) return NF_DROP_ERR(-ECONNREFUSED); if (peerlbl_active) { u32 if_sid; u32 node_sid; if (sel_netif_sid(dev_net(outdev), ifindex, &if_sid)) return NF_DROP; if (avc_has_perm(&selinux_state, peer_sid, if_sid, SECCLASS_NETIF, NETIF__EGRESS, &ad)) return NF_DROP_ERR(-ECONNREFUSED); if (sel_netnode_sid(addrp, family, &node_sid)) return NF_DROP; if (avc_has_perm(&selinux_state, peer_sid, node_sid, SECCLASS_NODE, NODE__SENDTO, &ad)) return NF_DROP_ERR(-ECONNREFUSED); } return NF_ACCEPT; } static unsigned int selinux_ipv4_postroute(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { return selinux_ip_postroute(skb, state->out, PF_INET); } #if IS_ENABLED(CONFIG_IPV6) static unsigned int selinux_ipv6_postroute(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { return selinux_ip_postroute(skb, state->out, PF_INET6); } #endif /* IPV6 */ #endif /* CONFIG_NETFILTER */ static int selinux_netlink_send(struct sock *sk, struct sk_buff *skb) { int rc = 0; unsigned int msg_len; unsigned int data_len = skb->len; unsigned char *data = skb->data; struct nlmsghdr *nlh; struct sk_security_struct *sksec = sk->sk_security; u16 sclass = sksec->sclass; u32 perm; while (data_len >= nlmsg_total_size(0)) { nlh = (struct nlmsghdr *)data; /* NOTE: the nlmsg_len field isn't reliably set by some netlink * users which means we can't reject skb's with bogus * length fields; our solution is to follow what * netlink_rcv_skb() does and simply skip processing at * messages with length fields that are clearly junk */ if (nlh->nlmsg_len < NLMSG_HDRLEN || nlh->nlmsg_len > data_len) return 0; rc = selinux_nlmsg_lookup(sclass, nlh->nlmsg_type, &perm); if (rc == 0) { rc = sock_has_perm(sk, perm); if (rc) return rc; } else if (rc == -EINVAL) { /* -EINVAL is a missing msg/perm mapping */ pr_warn_ratelimited("SELinux: unrecognized netlink" " message: protocol=%hu nlmsg_type=%hu sclass=%s" " pid=%d comm=%s\n", sk->sk_protocol, nlh->nlmsg_type, secclass_map[sclass - 1].name, task_pid_nr(current), current->comm); if (enforcing_enabled(&selinux_state) && !security_get_allow_unknown(&selinux_state)) return rc; rc = 0; } else if (rc == -ENOENT) { /* -ENOENT is a missing socket/class mapping, ignore */ rc = 0; } else { return rc; } /* move to the next message after applying netlink padding */ msg_len = NLMSG_ALIGN(nlh->nlmsg_len); if (msg_len >= data_len) return 0; data_len -= msg_len; data += msg_len; } return rc; } static void ipc_init_security(struct ipc_security_struct *isec, u16 sclass) { isec->sclass = sclass; isec->sid = current_sid(); } static int ipc_has_perm(struct kern_ipc_perm *ipc_perms, u32 perms) { struct ipc_security_struct *isec; struct common_audit_data ad; u32 sid = current_sid(); isec = selinux_ipc(ipc_perms); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = ipc_perms->key; return avc_has_perm(&selinux_state, sid, isec->sid, isec->sclass, perms, &ad); } static int selinux_msg_msg_alloc_security(struct msg_msg *msg) { struct msg_security_struct *msec; msec = selinux_msg_msg(msg); msec->sid = SECINITSID_UNLABELED; return 0; } /* message queue security operations */ static int selinux_msg_queue_alloc_security(struct kern_ipc_perm *msq) { struct ipc_security_struct *isec; struct common_audit_data ad; u32 sid = current_sid(); int rc; isec = selinux_ipc(msq); ipc_init_security(isec, SECCLASS_MSGQ); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = msq->key; rc = avc_has_perm(&selinux_state, sid, isec->sid, SECCLASS_MSGQ, MSGQ__CREATE, &ad); return rc; } static int selinux_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg) { struct ipc_security_struct *isec; struct common_audit_data ad; u32 sid = current_sid(); isec = selinux_ipc(msq); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = msq->key; return avc_has_perm(&selinux_state, sid, isec->sid, SECCLASS_MSGQ, MSGQ__ASSOCIATE, &ad); } static int selinux_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd) { int err; int perms; switch (cmd) { case IPC_INFO: case MSG_INFO: /* No specific object, just general system-wide information. */ return avc_has_perm(&selinux_state, current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL); case IPC_STAT: case MSG_STAT: case MSG_STAT_ANY: perms = MSGQ__GETATTR | MSGQ__ASSOCIATE; break; case IPC_SET: perms = MSGQ__SETATTR; break; case IPC_RMID: perms = MSGQ__DESTROY; break; default: return 0; } err = ipc_has_perm(msq, perms); return err; } static int selinux_msg_queue_msgsnd(struct kern_ipc_perm *msq, struct msg_msg *msg, int msqflg) { struct ipc_security_struct *isec; struct msg_security_struct *msec; struct common_audit_data ad; u32 sid = current_sid(); int rc; isec = selinux_ipc(msq); msec = selinux_msg_msg(msg); /* * First time through, need to assign label to the message */ if (msec->sid == SECINITSID_UNLABELED) { /* * Compute new sid based on current process and * message queue this message will be stored in */ rc = security_transition_sid(&selinux_state, sid, isec->sid, SECCLASS_MSG, NULL, &msec->sid); if (rc) return rc; } ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = msq->key; /* Can this process write to the queue? */ rc = avc_has_perm(&selinux_state, sid, isec->sid, SECCLASS_MSGQ, MSGQ__WRITE, &ad); if (!rc) /* Can this process send the message */ rc = avc_has_perm(&selinux_state, sid, msec->sid, SECCLASS_MSG, MSG__SEND, &ad); if (!rc) /* Can the message be put in the queue? */ rc = avc_has_perm(&selinux_state, msec->sid, isec->sid, SECCLASS_MSGQ, MSGQ__ENQUEUE, &ad); return rc; } static int selinux_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg, struct task_struct *target, long type, int mode) { struct ipc_security_struct *isec; struct msg_security_struct *msec; struct common_audit_data ad; u32 sid = task_sid(target); int rc; isec = selinux_ipc(msq); msec = selinux_msg_msg(msg); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = msq->key; rc = avc_has_perm(&selinux_state, sid, isec->sid, SECCLASS_MSGQ, MSGQ__READ, &ad); if (!rc) rc = avc_has_perm(&selinux_state, sid, msec->sid, SECCLASS_MSG, MSG__RECEIVE, &ad); return rc; } /* Shared Memory security operations */ static int selinux_shm_alloc_security(struct kern_ipc_perm *shp) { struct ipc_security_struct *isec; struct common_audit_data ad; u32 sid = current_sid(); int rc; isec = selinux_ipc(shp); ipc_init_security(isec, SECCLASS_SHM); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = shp->key; rc = avc_has_perm(&selinux_state, sid, isec->sid, SECCLASS_SHM, SHM__CREATE, &ad); return rc; } static int selinux_shm_associate(struct kern_ipc_perm *shp, int shmflg) { struct ipc_security_struct *isec; struct common_audit_data ad; u32 sid = current_sid(); isec = selinux_ipc(shp); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = shp->key; return avc_has_perm(&selinux_state, sid, isec->sid, SECCLASS_SHM, SHM__ASSOCIATE, &ad); } /* Note, at this point, shp is locked down */ static int selinux_shm_shmctl(struct kern_ipc_perm *shp, int cmd) { int perms; int err; switch (cmd) { case IPC_INFO: case SHM_INFO: /* No specific object, just general system-wide information. */ return avc_has_perm(&selinux_state, current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL); case IPC_STAT: case SHM_STAT: case SHM_STAT_ANY: perms = SHM__GETATTR | SHM__ASSOCIATE; break; case IPC_SET: perms = SHM__SETATTR; break; case SHM_LOCK: case SHM_UNLOCK: perms = SHM__LOCK; break; case IPC_RMID: perms = SHM__DESTROY; break; default: return 0; } err = ipc_has_perm(shp, perms); return err; } static int selinux_shm_shmat(struct kern_ipc_perm *shp, char __user *shmaddr, int shmflg) { u32 perms; if (shmflg & SHM_RDONLY) perms = SHM__READ; else perms = SHM__READ | SHM__WRITE; return ipc_has_perm(shp, perms); } /* Semaphore security operations */ static int selinux_sem_alloc_security(struct kern_ipc_perm *sma) { struct ipc_security_struct *isec; struct common_audit_data ad; u32 sid = current_sid(); int rc; isec = selinux_ipc(sma); ipc_init_security(isec, SECCLASS_SEM); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = sma->key; rc = avc_has_perm(&selinux_state, sid, isec->sid, SECCLASS_SEM, SEM__CREATE, &ad); return rc; } static int selinux_sem_associate(struct kern_ipc_perm *sma, int semflg) { struct ipc_security_struct *isec; struct common_audit_data ad; u32 sid = current_sid(); isec = selinux_ipc(sma); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = sma->key; return avc_has_perm(&selinux_state, sid, isec->sid, SECCLASS_SEM, SEM__ASSOCIATE, &ad); } /* Note, at this point, sma is locked down */ static int selinux_sem_semctl(struct kern_ipc_perm *sma, int cmd) { int err; u32 perms; switch (cmd) { case IPC_INFO: case SEM_INFO: /* No specific object, just general system-wide information. */ return avc_has_perm(&selinux_state, current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL); case GETPID: case GETNCNT: case GETZCNT: perms = SEM__GETATTR; break; case GETVAL: case GETALL: perms = SEM__READ; break; case SETVAL: case SETALL: perms = SEM__WRITE; break; case IPC_RMID: perms = SEM__DESTROY; break; case IPC_SET: perms = SEM__SETATTR; break; case IPC_STAT: case SEM_STAT: case SEM_STAT_ANY: perms = SEM__GETATTR | SEM__ASSOCIATE; break; default: return 0; } err = ipc_has_perm(sma, perms); return err; } static int selinux_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops, unsigned nsops, int alter) { u32 perms; if (alter) perms = SEM__READ | SEM__WRITE; else perms = SEM__READ; return ipc_has_perm(sma, perms); } static int selinux_ipc_permission(struct kern_ipc_perm *ipcp, short flag) { u32 av = 0; av = 0; if (flag & S_IRUGO) av |= IPC__UNIX_READ; if (flag & S_IWUGO) av |= IPC__UNIX_WRITE; if (av == 0) return 0; return ipc_has_perm(ipcp, av); } static void selinux_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid) { struct ipc_security_struct *isec = selinux_ipc(ipcp); *secid = isec->sid; } static void selinux_d_instantiate(struct dentry *dentry, struct inode *inode) { if (inode) inode_doinit_with_dentry(inode, dentry); } static int selinux_getprocattr(struct task_struct *p, char *name, char **value) { const struct task_security_struct *__tsec; u32 sid; int error; unsigned len; rcu_read_lock(); __tsec = selinux_cred(__task_cred(p)); if (current != p) { error = avc_has_perm(&selinux_state, current_sid(), __tsec->sid, SECCLASS_PROCESS, PROCESS__GETATTR, NULL); if (error) goto bad; } if (!strcmp(name, "current")) sid = __tsec->sid; else if (!strcmp(name, "prev")) sid = __tsec->osid; else if (!strcmp(name, "exec")) sid = __tsec->exec_sid; else if (!strcmp(name, "fscreate")) sid = __tsec->create_sid; else if (!strcmp(name, "keycreate")) sid = __tsec->keycreate_sid; else if (!strcmp(name, "sockcreate")) sid = __tsec->sockcreate_sid; else { error = -EINVAL; goto bad; } rcu_read_unlock(); if (!sid) return 0; error = security_sid_to_context(&selinux_state, sid, value, &len); if (error) return error; return len; bad: rcu_read_unlock(); return error; } static int selinux_setprocattr(const char *name, void *value, size_t size) { struct task_security_struct *tsec; struct cred *new; u32 mysid = current_sid(), sid = 0, ptsid; int error; char *str = value; /* * Basic control over ability to set these attributes at all. */ if (!strcmp(name, "exec")) error = avc_has_perm(&selinux_state, mysid, mysid, SECCLASS_PROCESS, PROCESS__SETEXEC, NULL); else if (!strcmp(name, "fscreate")) error = avc_has_perm(&selinux_state, mysid, mysid, SECCLASS_PROCESS, PROCESS__SETFSCREATE, NULL); else if (!strcmp(name, "keycreate")) error = avc_has_perm(&selinux_state, mysid, mysid, SECCLASS_PROCESS, PROCESS__SETKEYCREATE, NULL); else if (!strcmp(name, "sockcreate")) error = avc_has_perm(&selinux_state, mysid, mysid, SECCLASS_PROCESS, PROCESS__SETSOCKCREATE, NULL); else if (!strcmp(name, "current")) error = avc_has_perm(&selinux_state, mysid, mysid, SECCLASS_PROCESS, PROCESS__SETCURRENT, NULL); else error = -EINVAL; if (error) return error; /* Obtain a SID for the context, if one was specified. */ if (size && str[0] && str[0] != '\n') { if (str[size-1] == '\n') { str[size-1] = 0; size--; } error = security_context_to_sid(&selinux_state, value, size, &sid, GFP_KERNEL); if (error == -EINVAL && !strcmp(name, "fscreate")) { if (!has_cap_mac_admin(true)) { struct audit_buffer *ab; size_t audit_size; /* We strip a nul only if it is at the end, otherwise the * context contains a nul and we should audit that */ if (str[size - 1] == '\0') audit_size = size - 1; else audit_size = size; ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR); audit_log_format(ab, "op=fscreate invalid_context="); audit_log_n_untrustedstring(ab, value, audit_size); audit_log_end(ab); return error; } error = security_context_to_sid_force( &selinux_state, value, size, &sid); } if (error) return error; } new = prepare_creds(); if (!new) return -ENOMEM; /* Permission checking based on the specified context is performed during the actual operation (execve, open/mkdir/...), when we know the full context of the operation. See selinux_bprm_creds_for_exec for the execve checks and may_create for the file creation checks. The operation will then fail if the context is not permitted. */ tsec = selinux_cred(new); if (!strcmp(name, "exec")) { tsec->exec_sid = sid; } else if (!strcmp(name, "fscreate")) { tsec->create_sid = sid; } else if (!strcmp(name, "keycreate")) { if (sid) { error = avc_has_perm(&selinux_state, mysid, sid, SECCLASS_KEY, KEY__CREATE, NULL); if (error) goto abort_change; } tsec->keycreate_sid = sid; } else if (!strcmp(name, "sockcreate")) { tsec->sockcreate_sid = sid; } else if (!strcmp(name, "current")) { error = -EINVAL; if (sid == 0) goto abort_change; /* Only allow single threaded processes to change context */ error = -EPERM; if (!current_is_single_threaded()) { error = security_bounded_transition(&selinux_state, tsec->sid, sid); if (error) goto abort_change; } /* Check permissions for the transition. */ error = avc_has_perm(&selinux_state, tsec->sid, sid, SECCLASS_PROCESS, PROCESS__DYNTRANSITION, NULL); if (error) goto abort_change; /* Check for ptracing, and update the task SID if ok. Otherwise, leave SID unchanged and fail. */ ptsid = ptrace_parent_sid(); if (ptsid != 0) { error = avc_has_perm(&selinux_state, ptsid, sid, SECCLASS_PROCESS, PROCESS__PTRACE, NULL); if (error) goto abort_change; } tsec->sid = sid; } else { error = -EINVAL; goto abort_change; } commit_creds(new); return size; abort_change: abort_creds(new); return error; } static int selinux_ismaclabel(const char *name) { return (strcmp(name, XATTR_SELINUX_SUFFIX) == 0); } static int selinux_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) { return security_sid_to_context(&selinux_state, secid, secdata, seclen); } static int selinux_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid) { return security_context_to_sid(&selinux_state, secdata, seclen, secid, GFP_KERNEL); } static void selinux_release_secctx(char *secdata, u32 seclen) { kfree(secdata); } static void selinux_inode_invalidate_secctx(struct inode *inode) { struct inode_security_struct *isec = selinux_inode(inode); spin_lock(&isec->lock); isec->initialized = LABEL_INVALID; spin_unlock(&isec->lock); } /* * called with inode->i_mutex locked */ static int selinux_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen) { int rc = selinux_inode_setsecurity(inode, XATTR_SELINUX_SUFFIX, ctx, ctxlen, 0); /* Do not return error when suppressing label (SBLABEL_MNT not set). */ return rc == -EOPNOTSUPP ? 0 : rc; } /* * called with inode->i_mutex locked */ static int selinux_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen) { return __vfs_setxattr_noperm(dentry, XATTR_NAME_SELINUX, ctx, ctxlen, 0); } static int selinux_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) { int len = 0; len = selinux_inode_getsecurity(inode, XATTR_SELINUX_SUFFIX, ctx, true); if (len < 0) return len; *ctxlen = len; return 0; } #ifdef CONFIG_KEYS static int selinux_key_alloc(struct key *k, const struct cred *cred, unsigned long flags) { const struct task_security_struct *tsec; struct key_security_struct *ksec; ksec = kzalloc(sizeof(struct key_security_struct), GFP_KERNEL); if (!ksec) return -ENOMEM; tsec = selinux_cred(cred); if (tsec->keycreate_sid) ksec->sid = tsec->keycreate_sid; else ksec->sid = tsec->sid; k->security = ksec; return 0; } static void selinux_key_free(struct key *k) { struct key_security_struct *ksec = k->security; k->security = NULL; kfree(ksec); } static int selinux_key_permission(key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm) { struct key *key; struct key_security_struct *ksec; u32 perm, sid; switch (need_perm) { case KEY_NEED_VIEW: perm = KEY__VIEW; break; case KEY_NEED_READ: perm = KEY__READ; break; case KEY_NEED_WRITE: perm = KEY__WRITE; break; case KEY_NEED_SEARCH: perm = KEY__SEARCH; break; case KEY_NEED_LINK: perm = KEY__LINK; break; case KEY_NEED_SETATTR: perm = KEY__SETATTR; break; case KEY_NEED_UNLINK: case KEY_SYSADMIN_OVERRIDE: case KEY_AUTHTOKEN_OVERRIDE: case KEY_DEFER_PERM_CHECK: return 0; default: WARN_ON(1); return -EPERM; } sid = cred_sid(cred); key = key_ref_to_ptr(key_ref); ksec = key->security; return avc_has_perm(&selinux_state, sid, ksec->sid, SECCLASS_KEY, perm, NULL); } static int selinux_key_getsecurity(struct key *key, char **_buffer) { struct key_security_struct *ksec = key->security; char *context = NULL; unsigned len; int rc; rc = security_sid_to_context(&selinux_state, ksec->sid, &context, &len); if (!rc) rc = len; *_buffer = context; return rc; } #ifdef CONFIG_KEY_NOTIFICATIONS static int selinux_watch_key(struct key *key) { struct key_security_struct *ksec = key->security; u32 sid = current_sid(); return avc_has_perm(&selinux_state, sid, ksec->sid, SECCLASS_KEY, KEY__VIEW, NULL); } #endif #endif #ifdef CONFIG_SECURITY_INFINIBAND static int selinux_ib_pkey_access(void *ib_sec, u64 subnet_prefix, u16 pkey_val) { struct common_audit_data ad; int err; u32 sid = 0; struct ib_security_struct *sec = ib_sec; struct lsm_ibpkey_audit ibpkey; err = sel_ib_pkey_sid(subnet_prefix, pkey_val, &sid); if (err) return err; ad.type = LSM_AUDIT_DATA_IBPKEY; ibpkey.subnet_prefix = subnet_prefix; ibpkey.pkey = pkey_val; ad.u.ibpkey = &ibpkey; return avc_has_perm(&selinux_state, sec->sid, sid, SECCLASS_INFINIBAND_PKEY, INFINIBAND_PKEY__ACCESS, &ad); } static int selinux_ib_endport_manage_subnet(void *ib_sec, const char *dev_name, u8 port_num) { struct common_audit_data ad; int err; u32 sid = 0; struct ib_security_struct *sec = ib_sec; struct lsm_ibendport_audit ibendport; err = security_ib_endport_sid(&selinux_state, dev_name, port_num, &sid); if (err) return err; ad.type = LSM_AUDIT_DATA_IBENDPORT; strncpy(ibendport.dev_name, dev_name, sizeof(ibendport.dev_name)); ibendport.port = port_num; ad.u.ibendport = &ibendport; return avc_has_perm(&selinux_state, sec->sid, sid, SECCLASS_INFINIBAND_ENDPORT, INFINIBAND_ENDPORT__MANAGE_SUBNET, &ad); } static int selinux_ib_alloc_security(void **ib_sec) { struct ib_security_struct *sec; sec = kzalloc(sizeof(*sec), GFP_KERNEL); if (!sec) return -ENOMEM; sec->sid = current_sid(); *ib_sec = sec; return 0; } static void selinux_ib_free_security(void *ib_sec) { kfree(ib_sec); } #endif #ifdef CONFIG_BPF_SYSCALL static int selinux_bpf(int cmd, union bpf_attr *attr, unsigned int size) { u32 sid = current_sid(); int ret; switch (cmd) { case BPF_MAP_CREATE: ret = avc_has_perm(&selinux_state, sid, sid, SECCLASS_BPF, BPF__MAP_CREATE, NULL); break; case BPF_PROG_LOAD: ret = avc_has_perm(&selinux_state, sid, sid, SECCLASS_BPF, BPF__PROG_LOAD, NULL); break; default: ret = 0; break; } return ret; } static u32 bpf_map_fmode_to_av(fmode_t fmode) { u32 av = 0; if (fmode & FMODE_READ) av |= BPF__MAP_READ; if (fmode & FMODE_WRITE) av |= BPF__MAP_WRITE; return av; } /* This function will check the file pass through unix socket or binder to see * if it is a bpf related object. And apply correspinding checks on the bpf * object based on the type. The bpf maps and programs, not like other files and * socket, are using a shared anonymous inode inside the kernel as their inode. * So checking that inode cannot identify if the process have privilege to * access the bpf object and that's why we have to add this additional check in * selinux_file_receive and selinux_binder_transfer_files. */ static int bpf_fd_pass(struct file *file, u32 sid) { struct bpf_security_struct *bpfsec; struct bpf_prog *prog; struct bpf_map *map; int ret; if (file->f_op == &bpf_map_fops) { map = file->private_data; bpfsec = map->security; ret = avc_has_perm(&selinux_state, sid, bpfsec->sid, SECCLASS_BPF, bpf_map_fmode_to_av(file->f_mode), NULL); if (ret) return ret; } else if (file->f_op == &bpf_prog_fops) { prog = file->private_data; bpfsec = prog->aux->security; ret = avc_has_perm(&selinux_state, sid, bpfsec->sid, SECCLASS_BPF, BPF__PROG_RUN, NULL); if (ret) return ret; } return 0; } static int selinux_bpf_map(struct bpf_map *map, fmode_t fmode) { u32 sid = current_sid(); struct bpf_security_struct *bpfsec; bpfsec = map->security; return avc_has_perm(&selinux_state, sid, bpfsec->sid, SECCLASS_BPF, bpf_map_fmode_to_av(fmode), NULL); } static int selinux_bpf_prog(struct bpf_prog *prog) { u32 sid = current_sid(); struct bpf_security_struct *bpfsec; bpfsec = prog->aux->security; return avc_has_perm(&selinux_state, sid, bpfsec->sid, SECCLASS_BPF, BPF__PROG_RUN, NULL); } static int selinux_bpf_map_alloc(struct bpf_map *map) { struct bpf_security_struct *bpfsec; bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL); if (!bpfsec) return -ENOMEM; bpfsec->sid = current_sid(); map->security = bpfsec; return 0; } static void selinux_bpf_map_free(struct bpf_map *map) { struct bpf_security_struct *bpfsec = map->security; map->security = NULL; kfree(bpfsec); } static int selinux_bpf_prog_alloc(struct bpf_prog_aux *aux) { struct bpf_security_struct *bpfsec; bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL); if (!bpfsec) return -ENOMEM; bpfsec->sid = current_sid(); aux->security = bpfsec; return 0; } static void selinux_bpf_prog_free(struct bpf_prog_aux *aux) { struct bpf_security_struct *bpfsec = aux->security; aux->security = NULL; kfree(bpfsec); } #endif static int selinux_lockdown(enum lockdown_reason what) { struct common_audit_data ad; u32 sid = current_sid(); int invalid_reason = (what <= LOCKDOWN_NONE) || (what == LOCKDOWN_INTEGRITY_MAX) || (what >= LOCKDOWN_CONFIDENTIALITY_MAX); if (WARN(invalid_reason, "Invalid lockdown reason")) { audit_log(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR, "lockdown_reason=invalid"); return -EINVAL; } ad.type = LSM_AUDIT_DATA_LOCKDOWN; ad.u.reason = what; if (what <= LOCKDOWN_INTEGRITY_MAX) return avc_has_perm(&selinux_state, sid, sid, SECCLASS_LOCKDOWN, LOCKDOWN__INTEGRITY, &ad); else return avc_has_perm(&selinux_state, sid, sid, SECCLASS_LOCKDOWN, LOCKDOWN__CONFIDENTIALITY, &ad); } struct lsm_blob_sizes selinux_blob_sizes __lsm_ro_after_init = { .lbs_cred = sizeof(struct task_security_struct), .lbs_file = sizeof(struct file_security_struct), .lbs_inode = sizeof(struct inode_security_struct), .lbs_ipc = sizeof(struct ipc_security_struct), .lbs_msg_msg = sizeof(struct msg_security_struct), }; #ifdef CONFIG_PERF_EVENTS static int selinux_perf_event_open(struct perf_event_attr *attr, int type) { u32 requested, sid = current_sid(); if (type == PERF_SECURITY_OPEN) requested = PERF_EVENT__OPEN; else if (type == PERF_SECURITY_CPU) requested = PERF_EVENT__CPU; else if (type == PERF_SECURITY_KERNEL) requested = PERF_EVENT__KERNEL; else if (type == PERF_SECURITY_TRACEPOINT) requested = PERF_EVENT__TRACEPOINT; else return -EINVAL; return avc_has_perm(&selinux_state, sid, sid, SECCLASS_PERF_EVENT, requested, NULL); } static int selinux_perf_event_alloc(struct perf_event *event) { struct perf_event_security_struct *perfsec; perfsec = kzalloc(sizeof(*perfsec), GFP_KERNEL); if (!perfsec) return -ENOMEM; perfsec->sid = current_sid(); event->security = perfsec; return 0; } static void selinux_perf_event_free(struct perf_event *event) { struct perf_event_security_struct *perfsec = event->security; event->security = NULL; kfree(perfsec); } static int selinux_perf_event_read(struct perf_event *event) { struct perf_event_security_struct *perfsec = event->security; u32 sid = current_sid(); return avc_has_perm(&selinux_state, sid, perfsec->sid, SECCLASS_PERF_EVENT, PERF_EVENT__READ, NULL); } static int selinux_perf_event_write(struct perf_event *event) { struct perf_event_security_struct *perfsec = event->security; u32 sid = current_sid(); return avc_has_perm(&selinux_state, sid, perfsec->sid, SECCLASS_PERF_EVENT, PERF_EVENT__WRITE, NULL); } #endif /* * IMPORTANT NOTE: When adding new hooks, please be careful to keep this order: * 1. any hooks that don't belong to (2.) or (3.) below, * 2. hooks that both access structures allocated by other hooks, and allocate * structures that can be later accessed by other hooks (mostly "cloning" * hooks), * 3. hooks that only allocate structures that can be later accessed by other * hooks ("allocating" hooks). * * Please follow block comment delimiters in the list to keep this order. * * This ordering is needed for SELinux runtime disable to work at least somewhat * safely. Breaking the ordering rules above might lead to NULL pointer derefs * when disabling SELinux at runtime. */ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr), LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction), LSM_HOOK_INIT(binder_transfer_binder, selinux_binder_transfer_binder), LSM_HOOK_INIT(binder_transfer_file, selinux_binder_transfer_file), LSM_HOOK_INIT(ptrace_access_check, selinux_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, selinux_ptrace_traceme), LSM_HOOK_INIT(capget, selinux_capget), LSM_HOOK_INIT(capset, selinux_capset), LSM_HOOK_INIT(capable, selinux_capable), LSM_HOOK_INIT(quotactl, selinux_quotactl), LSM_HOOK_INIT(quota_on, selinux_quota_on), LSM_HOOK_INIT(syslog, selinux_syslog), LSM_HOOK_INIT(vm_enough_memory, selinux_vm_enough_memory), LSM_HOOK_INIT(netlink_send, selinux_netlink_send), LSM_HOOK_INIT(bprm_creds_for_exec, selinux_bprm_creds_for_exec), LSM_HOOK_INIT(bprm_committing_creds, selinux_bprm_committing_creds), LSM_HOOK_INIT(bprm_committed_creds, selinux_bprm_committed_creds), LSM_HOOK_INIT(sb_free_security, selinux_sb_free_security), LSM_HOOK_INIT(sb_free_mnt_opts, selinux_free_mnt_opts), LSM_HOOK_INIT(sb_remount, selinux_sb_remount), LSM_HOOK_INIT(sb_kern_mount, selinux_sb_kern_mount), LSM_HOOK_INIT(sb_show_options, selinux_sb_show_options), LSM_HOOK_INIT(sb_statfs, selinux_sb_statfs), LSM_HOOK_INIT(sb_mount, selinux_mount), LSM_HOOK_INIT(sb_umount, selinux_umount), LSM_HOOK_INIT(sb_set_mnt_opts, selinux_set_mnt_opts), LSM_HOOK_INIT(sb_clone_mnt_opts, selinux_sb_clone_mnt_opts), LSM_HOOK_INIT(move_mount, selinux_move_mount), LSM_HOOK_INIT(dentry_init_security, selinux_dentry_init_security), LSM_HOOK_INIT(dentry_create_files_as, selinux_dentry_create_files_as), LSM_HOOK_INIT(inode_free_security, selinux_inode_free_security), LSM_HOOK_INIT(inode_init_security, selinux_inode_init_security), LSM_HOOK_INIT(inode_create, selinux_inode_create), LSM_HOOK_INIT(inode_link, selinux_inode_link), LSM_HOOK_INIT(inode_unlink, selinux_inode_unlink), LSM_HOOK_INIT(inode_symlink, selinux_inode_symlink), LSM_HOOK_INIT(inode_mkdir, selinux_inode_mkdir), LSM_HOOK_INIT(inode_rmdir, selinux_inode_rmdir), LSM_HOOK_INIT(inode_mknod, selinux_inode_mknod), LSM_HOOK_INIT(inode_rename, selinux_inode_rename), LSM_HOOK_INIT(inode_readlink, selinux_inode_readlink), LSM_HOOK_INIT(inode_follow_link, selinux_inode_follow_link), LSM_HOOK_INIT(inode_permission, selinux_inode_permission), LSM_HOOK_INIT(inode_setattr, selinux_inode_setattr), LSM_HOOK_INIT(inode_getattr, selinux_inode_getattr), LSM_HOOK_INIT(inode_setxattr, selinux_inode_setxattr), LSM_HOOK_INIT(inode_post_setxattr, selinux_inode_post_setxattr), LSM_HOOK_INIT(inode_getxattr, selinux_inode_getxattr), LSM_HOOK_INIT(inode_listxattr, selinux_inode_listxattr), LSM_HOOK_INIT(inode_removexattr, selinux_inode_removexattr), LSM_HOOK_INIT(inode_getsecurity, selinux_inode_getsecurity), LSM_HOOK_INIT(inode_setsecurity, selinux_inode_setsecurity), LSM_HOOK_INIT(inode_listsecurity, selinux_inode_listsecurity), LSM_HOOK_INIT(inode_getsecid, selinux_inode_getsecid), LSM_HOOK_INIT(inode_copy_up, selinux_inode_copy_up), LSM_HOOK_INIT(inode_copy_up_xattr, selinux_inode_copy_up_xattr), LSM_HOOK_INIT(path_notify, selinux_path_notify), LSM_HOOK_INIT(kernfs_init_security, selinux_kernfs_init_security), LSM_HOOK_INIT(file_permission, selinux_file_permission), LSM_HOOK_INIT(file_alloc_security, selinux_file_alloc_security), LSM_HOOK_INIT(file_ioctl, selinux_file_ioctl), LSM_HOOK_INIT(file_ioctl_compat, selinux_file_ioctl_compat), LSM_HOOK_INIT(mmap_file, selinux_mmap_file), LSM_HOOK_INIT(mmap_addr, selinux_mmap_addr), LSM_HOOK_INIT(file_mprotect, selinux_file_mprotect), LSM_HOOK_INIT(file_lock, selinux_file_lock), LSM_HOOK_INIT(file_fcntl, selinux_file_fcntl), LSM_HOOK_INIT(file_set_fowner, selinux_file_set_fowner), LSM_HOOK_INIT(file_send_sigiotask, selinux_file_send_sigiotask), LSM_HOOK_INIT(file_receive, selinux_file_receive), LSM_HOOK_INIT(file_open, selinux_file_open), LSM_HOOK_INIT(task_alloc, selinux_task_alloc), LSM_HOOK_INIT(cred_prepare, selinux_cred_prepare), LSM_HOOK_INIT(cred_transfer, selinux_cred_transfer), LSM_HOOK_INIT(cred_getsecid, selinux_cred_getsecid), LSM_HOOK_INIT(kernel_act_as, selinux_kernel_act_as), LSM_HOOK_INIT(kernel_create_files_as, selinux_kernel_create_files_as), LSM_HOOK_INIT(kernel_module_request, selinux_kernel_module_request), LSM_HOOK_INIT(kernel_load_data, selinux_kernel_load_data), LSM_HOOK_INIT(kernel_read_file, selinux_kernel_read_file), LSM_HOOK_INIT(task_setpgid, selinux_task_setpgid), LSM_HOOK_INIT(task_getpgid, selinux_task_getpgid), LSM_HOOK_INIT(task_getsid, selinux_task_getsid), LSM_HOOK_INIT(task_getsecid, selinux_task_getsecid), LSM_HOOK_INIT(task_setnice, selinux_task_setnice), LSM_HOOK_INIT(task_setioprio, selinux_task_setioprio), LSM_HOOK_INIT(task_getioprio, selinux_task_getioprio), LSM_HOOK_INIT(task_prlimit, selinux_task_prlimit), LSM_HOOK_INIT(task_setrlimit, selinux_task_setrlimit), LSM_HOOK_INIT(task_setscheduler, selinux_task_setscheduler), LSM_HOOK_INIT(task_getscheduler, selinux_task_getscheduler), LSM_HOOK_INIT(task_movememory, selinux_task_movememory), LSM_HOOK_INIT(task_kill, selinux_task_kill), LSM_HOOK_INIT(task_to_inode, selinux_task_to_inode), LSM_HOOK_INIT(ipc_permission, selinux_ipc_permission), LSM_HOOK_INIT(ipc_getsecid, selinux_ipc_getsecid), LSM_HOOK_INIT(msg_queue_associate, selinux_msg_queue_associate), LSM_HOOK_INIT(msg_queue_msgctl, selinux_msg_queue_msgctl), LSM_HOOK_INIT(msg_queue_msgsnd, selinux_msg_queue_msgsnd), LSM_HOOK_INIT(msg_queue_msgrcv, selinux_msg_queue_msgrcv), LSM_HOOK_INIT(shm_associate, selinux_shm_associate), LSM_HOOK_INIT(shm_shmctl, selinux_shm_shmctl), LSM_HOOK_INIT(shm_shmat, selinux_shm_shmat), LSM_HOOK_INIT(sem_associate, selinux_sem_associate), LSM_HOOK_INIT(sem_semctl, selinux_sem_semctl), LSM_HOOK_INIT(sem_semop, selinux_sem_semop), LSM_HOOK_INIT(d_instantiate, selinux_d_instantiate), LSM_HOOK_INIT(getprocattr, selinux_getprocattr), LSM_HOOK_INIT(setprocattr, selinux_setprocattr), LSM_HOOK_INIT(ismaclabel, selinux_ismaclabel), LSM_HOOK_INIT(secctx_to_secid, selinux_secctx_to_secid), LSM_HOOK_INIT(release_secctx, selinux_release_secctx), LSM_HOOK_INIT(inode_invalidate_secctx, selinux_inode_invalidate_secctx), LSM_HOOK_INIT(inode_notifysecctx, selinux_inode_notifysecctx), LSM_HOOK_INIT(inode_setsecctx, selinux_inode_setsecctx), LSM_HOOK_INIT(unix_stream_connect, selinux_socket_unix_stream_connect), LSM_HOOK_INIT(unix_may_send, selinux_socket_unix_may_send), LSM_HOOK_INIT(socket_create, selinux_socket_create), LSM_HOOK_INIT(socket_post_create, selinux_socket_post_create), LSM_HOOK_INIT(socket_socketpair, selinux_socket_socketpair), LSM_HOOK_INIT(socket_bind, selinux_socket_bind), LSM_HOOK_INIT(socket_connect, selinux_socket_connect), LSM_HOOK_INIT(socket_listen, selinux_socket_listen), LSM_HOOK_INIT(socket_accept, selinux_socket_accept), LSM_HOOK_INIT(socket_sendmsg, selinux_socket_sendmsg), LSM_HOOK_INIT(socket_recvmsg, selinux_socket_recvmsg), LSM_HOOK_INIT(socket_getsockname, selinux_socket_getsockname), LSM_HOOK_INIT(socket_getpeername, selinux_socket_getpeername), LSM_HOOK_INIT(socket_getsockopt, selinux_socket_getsockopt), LSM_HOOK_INIT(socket_setsockopt, selinux_socket_setsockopt), LSM_HOOK_INIT(socket_shutdown, selinux_socket_shutdown), LSM_HOOK_INIT(socket_sock_rcv_skb, selinux_socket_sock_rcv_skb), LSM_HOOK_INIT(socket_getpeersec_stream, selinux_socket_getpeersec_stream), LSM_HOOK_INIT(socket_getpeersec_dgram, selinux_socket_getpeersec_dgram), LSM_HOOK_INIT(sk_free_security, selinux_sk_free_security), LSM_HOOK_INIT(sk_clone_security, selinux_sk_clone_security), LSM_HOOK_INIT(sk_getsecid, selinux_sk_getsecid), LSM_HOOK_INIT(sock_graft, selinux_sock_graft), LSM_HOOK_INIT(sctp_assoc_request, selinux_sctp_assoc_request), LSM_HOOK_INIT(sctp_sk_clone, selinux_sctp_sk_clone), LSM_HOOK_INIT(sctp_bind_connect, selinux_sctp_bind_connect), LSM_HOOK_INIT(inet_conn_request, selinux_inet_conn_request), LSM_HOOK_INIT(inet_csk_clone, selinux_inet_csk_clone), LSM_HOOK_INIT(inet_conn_established, selinux_inet_conn_established), LSM_HOOK_INIT(secmark_relabel_packet, selinux_secmark_relabel_packet), LSM_HOOK_INIT(secmark_refcount_inc, selinux_secmark_refcount_inc), LSM_HOOK_INIT(secmark_refcount_dec, selinux_secmark_refcount_dec), LSM_HOOK_INIT(req_classify_flow, selinux_req_classify_flow), LSM_HOOK_INIT(tun_dev_free_security, selinux_tun_dev_free_security), LSM_HOOK_INIT(tun_dev_create, selinux_tun_dev_create), LSM_HOOK_INIT(tun_dev_attach_queue, selinux_tun_dev_attach_queue), LSM_HOOK_INIT(tun_dev_attach, selinux_tun_dev_attach), LSM_HOOK_INIT(tun_dev_open, selinux_tun_dev_open), #ifdef CONFIG_SECURITY_INFINIBAND LSM_HOOK_INIT(ib_pkey_access, selinux_ib_pkey_access), LSM_HOOK_INIT(ib_endport_manage_subnet, selinux_ib_endport_manage_subnet), LSM_HOOK_INIT(ib_free_security, selinux_ib_free_security), #endif #ifdef CONFIG_SECURITY_NETWORK_XFRM LSM_HOOK_INIT(xfrm_policy_free_security, selinux_xfrm_policy_free), LSM_HOOK_INIT(xfrm_policy_delete_security, selinux_xfrm_policy_delete), LSM_HOOK_INIT(xfrm_state_free_security, selinux_xfrm_state_free), LSM_HOOK_INIT(xfrm_state_delete_security, selinux_xfrm_state_delete), LSM_HOOK_INIT(xfrm_policy_lookup, selinux_xfrm_policy_lookup), LSM_HOOK_INIT(xfrm_state_pol_flow_match, selinux_xfrm_state_pol_flow_match), LSM_HOOK_INIT(xfrm_decode_session, selinux_xfrm_decode_session), #endif #ifdef CONFIG_KEYS LSM_HOOK_INIT(key_free, selinux_key_free), LSM_HOOK_INIT(key_permission, selinux_key_permission), LSM_HOOK_INIT(key_getsecurity, selinux_key_getsecurity), #ifdef CONFIG_KEY_NOTIFICATIONS LSM_HOOK_INIT(watch_key, selinux_watch_key), #endif #endif #ifdef CONFIG_AUDIT LSM_HOOK_INIT(audit_rule_known, selinux_audit_rule_known), LSM_HOOK_INIT(audit_rule_match, selinux_audit_rule_match), LSM_HOOK_INIT(audit_rule_free, selinux_audit_rule_free), #endif #ifdef CONFIG_BPF_SYSCALL LSM_HOOK_INIT(bpf, selinux_bpf), LSM_HOOK_INIT(bpf_map, selinux_bpf_map), LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog), LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free), LSM_HOOK_INIT(bpf_prog_free_security, selinux_bpf_prog_free), #endif #ifdef CONFIG_PERF_EVENTS LSM_HOOK_INIT(perf_event_open, selinux_perf_event_open), LSM_HOOK_INIT(perf_event_free, selinux_perf_event_free), LSM_HOOK_INIT(perf_event_read, selinux_perf_event_read), LSM_HOOK_INIT(perf_event_write, selinux_perf_event_write), #endif LSM_HOOK_INIT(locked_down, selinux_lockdown), /* * PUT "CLONING" (ACCESSING + ALLOCATING) HOOKS HERE */ LSM_HOOK_INIT(fs_context_dup, selinux_fs_context_dup), LSM_HOOK_INIT(fs_context_parse_param, selinux_fs_context_parse_param), LSM_HOOK_INIT(sb_eat_lsm_opts, selinux_sb_eat_lsm_opts), LSM_HOOK_INIT(sb_add_mnt_opt, selinux_add_mnt_opt), #ifdef CONFIG_SECURITY_NETWORK_XFRM LSM_HOOK_INIT(xfrm_policy_clone_security, selinux_xfrm_policy_clone), #endif /* * PUT "ALLOCATING" HOOKS HERE */ LSM_HOOK_INIT(msg_msg_alloc_security, selinux_msg_msg_alloc_security), LSM_HOOK_INIT(msg_queue_alloc_security, selinux_msg_queue_alloc_security), LSM_HOOK_INIT(shm_alloc_security, selinux_shm_alloc_security), LSM_HOOK_INIT(sb_alloc_security, selinux_sb_alloc_security), LSM_HOOK_INIT(inode_alloc_security, selinux_inode_alloc_security), LSM_HOOK_INIT(sem_alloc_security, selinux_sem_alloc_security), LSM_HOOK_INIT(secid_to_secctx, selinux_secid_to_secctx), LSM_HOOK_INIT(inode_getsecctx, selinux_inode_getsecctx), LSM_HOOK_INIT(sk_alloc_security, selinux_sk_alloc_security), LSM_HOOK_INIT(tun_dev_alloc_security, selinux_tun_dev_alloc_security), #ifdef CONFIG_SECURITY_INFINIBAND LSM_HOOK_INIT(ib_alloc_security, selinux_ib_alloc_security), #endif #ifdef CONFIG_SECURITY_NETWORK_XFRM LSM_HOOK_INIT(xfrm_policy_alloc_security, selinux_xfrm_policy_alloc), LSM_HOOK_INIT(xfrm_state_alloc, selinux_xfrm_state_alloc), LSM_HOOK_INIT(xfrm_state_alloc_acquire, selinux_xfrm_state_alloc_acquire), #endif #ifdef CONFIG_KEYS LSM_HOOK_INIT(key_alloc, selinux_key_alloc), #endif #ifdef CONFIG_AUDIT LSM_HOOK_INIT(audit_rule_init, selinux_audit_rule_init), #endif #ifdef CONFIG_BPF_SYSCALL LSM_HOOK_INIT(bpf_map_alloc_security, selinux_bpf_map_alloc), LSM_HOOK_INIT(bpf_prog_alloc_security, selinux_bpf_prog_alloc), #endif #ifdef CONFIG_PERF_EVENTS LSM_HOOK_INIT(perf_event_alloc, selinux_perf_event_alloc), #endif }; static __init int selinux_init(void) { pr_info("SELinux: Initializing.\n"); memset(&selinux_state, 0, sizeof(selinux_state)); enforcing_set(&selinux_state, selinux_enforcing_boot); checkreqprot_set(&selinux_state, selinux_checkreqprot_boot); selinux_avc_init(&selinux_state.avc); mutex_init(&selinux_state.status_lock); mutex_init(&selinux_state.policy_mutex); /* Set the security state for the initial task. */ cred_init_security(); default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC); avc_init(); avtab_cache_init(); ebitmap_cache_init(); hashtab_cache_init(); security_add_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks), "selinux"); if (avc_add_callback(selinux_netcache_avc_callback, AVC_CALLBACK_RESET)) panic("SELinux: Unable to register AVC netcache callback\n"); if (avc_add_callback(selinux_lsm_notifier_avc_callback, AVC_CALLBACK_RESET)) panic("SELinux: Unable to register AVC LSM notifier callback\n"); if (selinux_enforcing_boot) pr_debug("SELinux: Starting in enforcing mode\n"); else pr_debug("SELinux: Starting in permissive mode\n"); fs_validate_description("selinux", selinux_fs_parameters); return 0; } static void delayed_superblock_init(struct super_block *sb, void *unused) { selinux_set_mnt_opts(sb, NULL, 0, NULL); } void selinux_complete_init(void) { pr_debug("SELinux: Completing initialization.\n"); /* Set up any superblocks initialized prior to the policy load. */ pr_debug("SELinux: Setting up existing superblocks.\n"); iterate_supers(delayed_superblock_init, NULL); } /* SELinux requires early initialization in order to label all processes and objects when they are created. */ DEFINE_LSM(selinux) = { .name = "selinux", .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE, .enabled = &selinux_enabled_boot, .blobs = &selinux_blob_sizes, .init = selinux_init, }; #if defined(CONFIG_NETFILTER) static const struct nf_hook_ops selinux_nf_ops[] = { { .hook = selinux_ipv4_postroute, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_SELINUX_LAST, }, { .hook = selinux_ipv4_forward, .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = NF_IP_PRI_SELINUX_FIRST, }, { .hook = selinux_ipv4_output, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_SELINUX_FIRST, }, #if IS_ENABLED(CONFIG_IPV6) { .hook = selinux_ipv6_postroute, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_SELINUX_LAST, }, { .hook = selinux_ipv6_forward, .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = NF_IP6_PRI_SELINUX_FIRST, }, { .hook = selinux_ipv6_output, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_SELINUX_FIRST, }, #endif /* IPV6 */ }; static int __net_init selinux_nf_register(struct net *net) { return nf_register_net_hooks(net, selinux_nf_ops, ARRAY_SIZE(selinux_nf_ops)); } static void __net_exit selinux_nf_unregister(struct net *net) { nf_unregister_net_hooks(net, selinux_nf_ops, ARRAY_SIZE(selinux_nf_ops)); } static struct pernet_operations selinux_net_ops = { .init = selinux_nf_register, .exit = selinux_nf_unregister, }; static int __init selinux_nf_ip_init(void) { int err; if (!selinux_enabled_boot) return 0; pr_debug("SELinux: Registering netfilter hooks\n"); err = register_pernet_subsys(&selinux_net_ops); if (err) panic("SELinux: register_pernet_subsys: error %d\n", err); return 0; } __initcall(selinux_nf_ip_init); #ifdef CONFIG_SECURITY_SELINUX_DISABLE static void selinux_nf_ip_exit(void) { pr_debug("SELinux: Unregistering netfilter hooks\n"); unregister_pernet_subsys(&selinux_net_ops); } #endif #else /* CONFIG_NETFILTER */ #ifdef CONFIG_SECURITY_SELINUX_DISABLE #define selinux_nf_ip_exit() #endif #endif /* CONFIG_NETFILTER */ #ifdef CONFIG_SECURITY_SELINUX_DISABLE int selinux_disable(struct selinux_state *state) { if (selinux_initialized(state)) { /* Not permitted after initial policy load. */ return -EINVAL; } if (selinux_disabled(state)) { /* Only do this once. */ return -EINVAL; } selinux_mark_disabled(state); pr_info("SELinux: Disabled at runtime.\n"); /* * Unregister netfilter hooks. * Must be done before security_delete_hooks() to avoid breaking * runtime disable. */ selinux_nf_ip_exit(); security_delete_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks)); /* Try to destroy the avc node cache */ avc_disable(); /* Unregister selinuxfs. */ exit_sel_fs(); return 0; } #endif
4 4 8 4 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Credentials management - see Documentation/security/credentials.rst * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_CRED_H #define _LINUX_CRED_H #include <linux/capability.h> #include <linux/init.h> #include <linux/key.h> #include <linux/atomic.h> #include <linux/uidgid.h> #include <linux/sched.h> #include <linux/sched/user.h> struct cred; struct inode; /* * COW Supplementary groups list */ struct group_info { atomic_t usage; int ngroups; kgid_t gid[0]; } __randomize_layout; /** * get_group_info - Get a reference to a group info structure * @group_info: The group info to reference * * This gets a reference to a set of supplementary groups. * * If the caller is accessing a task's credentials, they must hold the RCU read * lock when reading. */ static inline struct group_info *get_group_info(struct group_info *gi) { atomic_inc(&gi->usage); return gi; } /** * put_group_info - Release a reference to a group info structure * @group_info: The group info to release */ #define put_group_info(group_info) \ do { \ if (atomic_dec_and_test(&(group_info)->usage)) \ groups_free(group_info); \ } while (0) extern struct group_info init_groups; #ifdef CONFIG_MULTIUSER extern struct group_info *groups_alloc(int); extern void groups_free(struct group_info *); extern int in_group_p(kgid_t); extern int in_egroup_p(kgid_t); extern int groups_search(const struct group_info *, kgid_t); extern int set_current_groups(struct group_info *); extern void set_groups(struct cred *, struct group_info *); extern bool may_setgroups(void); extern void groups_sort(struct group_info *); #else static inline void groups_free(struct group_info *group_info) { } static inline int in_group_p(kgid_t grp) { return 1; } static inline int in_egroup_p(kgid_t grp) { return 1; } static inline int groups_search(const struct group_info *group_info, kgid_t grp) { return 1; } #endif /* * The security context of a task * * The parts of the context break down into two categories: * * (1) The objective context of a task. These parts are used when some other * task is attempting to affect this one. * * (2) The subjective context. These details are used when the task is acting * upon another object, be that a file, a task, a key or whatever. * * Note that some members of this structure belong to both categories - the * LSM security pointer for instance. * * A task has two security pointers. task->real_cred points to the objective * context that defines that task's actual details. The objective part of this * context is used whenever that task is acted upon. * * task->cred points to the subjective context that defines the details of how * that task is going to act upon another object. This may be overridden * temporarily to point to another security context, but normally points to the * same context as task->real_cred. */ struct cred { atomic_long_t usage; #ifdef CONFIG_DEBUG_CREDENTIALS atomic_t subscribers; /* number of processes subscribed */ void *put_addr; unsigned magic; #define CRED_MAGIC 0x43736564 #define CRED_MAGIC_DEAD 0x44656144 #endif kuid_t uid; /* real UID of the task */ kgid_t gid; /* real GID of the task */ kuid_t suid; /* saved UID of the task */ kgid_t sgid; /* saved GID of the task */ kuid_t euid; /* effective UID of the task */ kgid_t egid; /* effective GID of the task */ kuid_t fsuid; /* UID for VFS ops */ kgid_t fsgid; /* GID for VFS ops */ unsigned securebits; /* SUID-less security management */ kernel_cap_t cap_inheritable; /* caps our children can inherit */ kernel_cap_t cap_permitted; /* caps we're permitted */ kernel_cap_t cap_effective; /* caps we can actually use */ kernel_cap_t cap_bset; /* capability bounding set */ kernel_cap_t cap_ambient; /* Ambient capability set */ #ifdef CONFIG_KEYS unsigned char jit_keyring; /* default keyring to attach requested * keys to */ struct key *session_keyring; /* keyring inherited over fork */ struct key *process_keyring; /* keyring private to this process */ struct key *thread_keyring; /* keyring private to this thread */ struct key *request_key_auth; /* assumed request_key authority */ #endif #ifdef CONFIG_SECURITY void *security; /* subjective LSM security */ #endif struct user_struct *user; /* real user ID subscription */ struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */ struct group_info *group_info; /* supplementary groups for euid/fsgid */ /* RCU deletion */ union { int non_rcu; /* Can we skip RCU deletion? */ struct rcu_head rcu; /* RCU deletion hook */ }; } __randomize_layout; extern void __put_cred(struct cred *); extern void exit_creds(struct task_struct *); extern int copy_creds(struct task_struct *, unsigned long); extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); extern struct cred *prepare_exec_creds(void); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); extern const struct cred *override_creds(const struct cred *); extern void revert_creds(const struct cred *); extern struct cred *prepare_kernel_cred(struct task_struct *); extern int change_create_files_as(struct cred *, struct inode *); extern int set_security_override(struct cred *, u32); extern int set_security_override_from_ctx(struct cred *, const char *); extern int set_create_files_as(struct cred *, struct inode *); extern int cred_fscmp(const struct cred *, const struct cred *); extern void __init cred_init(void); /* * check for validity of credentials */ #ifdef CONFIG_DEBUG_CREDENTIALS extern void __invalid_creds(const struct cred *, const char *, unsigned); extern void __validate_process_creds(struct task_struct *, const char *, unsigned); extern bool creds_are_invalid(const struct cred *cred); static inline void __validate_creds(const struct cred *cred, const char *file, unsigned line) { if (unlikely(creds_are_invalid(cred))) __invalid_creds(cred, file, line); } #define validate_creds(cred) \ do { \ __validate_creds((cred), __FILE__, __LINE__); \ } while(0) #define validate_process_creds() \ do { \ __validate_process_creds(current, __FILE__, __LINE__); \ } while(0) extern void validate_creds_for_do_exit(struct task_struct *); #else static inline void validate_creds(const struct cred *cred) { } static inline void validate_creds_for_do_exit(struct task_struct *tsk) { } static inline void validate_process_creds(void) { } #endif static inline bool cap_ambient_invariant_ok(const struct cred *cred) { return cap_issubset(cred->cap_ambient, cap_intersect(cred->cap_permitted, cred->cap_inheritable)); } /** * get_new_cred - Get a reference on a new set of credentials * @cred: The new credentials to reference * * Get a reference on the specified set of new credentials. The caller must * release the reference. */ static inline struct cred *get_new_cred(struct cred *cred) { atomic_long_inc(&cred->usage); return cred; } /** * get_cred - Get a reference on a set of credentials * @cred: The credentials to reference * * Get a reference on the specified set of credentials. The caller must * release the reference. If %NULL is passed, it is returned with no action. * * This is used to deal with a committed set of credentials. Although the * pointer is const, this will temporarily discard the const and increment the * usage count. The purpose of this is to attempt to catch at compile time the * accidental alteration of a set of credentials that should be considered * immutable. */ static inline const struct cred *get_cred(const struct cred *cred) { struct cred *nonconst_cred = (struct cred *) cred; if (!cred) return cred; validate_creds(cred); nonconst_cred->non_rcu = 0; return get_new_cred(nonconst_cred); } static inline const struct cred *get_cred_rcu(const struct cred *cred) { struct cred *nonconst_cred = (struct cred *) cred; if (!cred) return NULL; if (!atomic_long_inc_not_zero(&nonconst_cred->usage)) return NULL; validate_creds(cred); nonconst_cred->non_rcu = 0; return cred; } /** * put_cred - Release a reference to a set of credentials * @cred: The credentials to release * * Release a reference to a set of credentials, deleting them when the last ref * is released. If %NULL is passed, nothing is done. * * This takes a const pointer to a set of credentials because the credentials * on task_struct are attached by const pointers to prevent accidental * alteration of otherwise immutable credential sets. */ static inline void put_cred(const struct cred *_cred) { struct cred *cred = (struct cred *) _cred; if (cred) { validate_creds(cred); if (atomic_long_dec_and_test(&(cred)->usage)) __put_cred(cred); } } /** * current_cred - Access the current task's subjective credentials * * Access the subjective credentials of the current task. RCU-safe, * since nobody else can modify it. */ #define current_cred() \ rcu_dereference_protected(current->cred, 1) /** * current_real_cred - Access the current task's objective credentials * * Access the objective credentials of the current task. RCU-safe, * since nobody else can modify it. */ #define current_real_cred() \ rcu_dereference_protected(current->real_cred, 1) /** * __task_cred - Access a task's objective credentials * @task: The task to query * * Access the objective credentials of a task. The caller must hold the RCU * readlock. * * The result of this function should not be passed directly to get_cred(); * rather get_task_cred() should be used instead. */ #define __task_cred(task) \ rcu_dereference((task)->real_cred) /** * get_current_cred - Get the current task's subjective credentials * * Get the subjective credentials of the current task, pinning them so that * they can't go away. Accessing the current task's credentials directly is * not permitted. */ #define get_current_cred() \ (get_cred(current_cred())) /** * get_current_user - Get the current task's user_struct * * Get the user record of the current task, pinning it so that it can't go * away. */ #define get_current_user() \ ({ \ struct user_struct *__u; \ const struct cred *__cred; \ __cred = current_cred(); \ __u = get_uid(__cred->user); \ __u; \ }) /** * get_current_groups - Get the current task's supplementary group list * * Get the supplementary group list of the current task, pinning it so that it * can't go away. */ #define get_current_groups() \ ({ \ struct group_info *__groups; \ const struct cred *__cred; \ __cred = current_cred(); \ __groups = get_group_info(__cred->group_info); \ __groups; \ }) #define task_cred_xxx(task, xxx) \ ({ \ __typeof__(((struct cred *)NULL)->xxx) ___val; \ rcu_read_lock(); \ ___val = __task_cred((task))->xxx; \ rcu_read_unlock(); \ ___val; \ }) #define task_uid(task) (task_cred_xxx((task), uid)) #define task_euid(task) (task_cred_xxx((task), euid)) #define current_cred_xxx(xxx) \ ({ \ current_cred()->xxx; \ }) #define current_uid() (current_cred_xxx(uid)) #define current_gid() (current_cred_xxx(gid)) #define current_euid() (current_cred_xxx(euid)) #define current_egid() (current_cred_xxx(egid)) #define current_suid() (current_cred_xxx(suid)) #define current_sgid() (current_cred_xxx(sgid)) #define current_fsuid() (current_cred_xxx(fsuid)) #define current_fsgid() (current_cred_xxx(fsgid)) #define current_cap() (current_cred_xxx(cap_effective)) #define current_user() (current_cred_xxx(user)) extern struct user_namespace init_user_ns; #ifdef CONFIG_USER_NS #define current_user_ns() (current_cred_xxx(user_ns)) #else static inline struct user_namespace *current_user_ns(void) { return &init_user_ns; } #endif #define current_uid_gid(_uid, _gid) \ do { \ const struct cred *__cred; \ __cred = current_cred(); \ *(_uid) = __cred->uid; \ *(_gid) = __cred->gid; \ } while(0) #define current_euid_egid(_euid, _egid) \ do { \ const struct cred *__cred; \ __cred = current_cred(); \ *(_euid) = __cred->euid; \ *(_egid) = __cred->egid; \ } while(0) #define current_fsuid_fsgid(_fsuid, _fsgid) \ do { \ const struct cred *__cred; \ __cred = current_cred(); \ *(_fsuid) = __cred->fsuid; \ *(_fsgid) = __cred->fsgid; \ } while(0) #endif /* _LINUX_CRED_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PID_H #define _LINUX_PID_H #include <linux/rculist.h> #include <linux/wait.h> #include <linux/refcount.h> enum pid_type { PIDTYPE_PID, PIDTYPE_TGID, PIDTYPE_PGID, PIDTYPE_SID, PIDTYPE_MAX, }; /* * What is struct pid? * * A struct pid is the kernel's internal notion of a process identifier. * It refers to individual tasks, process groups, and sessions. While * there are processes attached to it the struct pid lives in a hash * table, so it and then the processes that it refers to can be found * quickly from the numeric pid value. The attached processes may be * quickly accessed by following pointers from struct pid. * * Storing pid_t values in the kernel and referring to them later has a * problem. The process originally with that pid may have exited and the * pid allocator wrapped, and another process could have come along * and been assigned that pid. * * Referring to user space processes by holding a reference to struct * task_struct has a problem. When the user space process exits * the now useless task_struct is still kept. A task_struct plus a * stack consumes around 10K of low kernel memory. More precisely * this is THREAD_SIZE + sizeof(struct task_struct). By comparison * a struct pid is about 64 bytes. * * Holding a reference to struct pid solves both of these problems. * It is small so holding a reference does not consume a lot of * resources, and since a new struct pid is allocated when the numeric pid * value is reused (when pids wrap around) we don't mistakenly refer to new * processes. */ /* * struct upid is used to get the id of the struct pid, as it is * seen in particular namespace. Later the struct pid is found with * find_pid_ns() using the int nr and struct pid_namespace *ns. */ struct upid { int nr; struct pid_namespace *ns; }; struct pid { refcount_t count; unsigned int level; spinlock_t lock; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; struct hlist_head inodes; /* wait queue for pidfd notifications */ wait_queue_head_t wait_pidfd; struct rcu_head rcu; struct upid numbers[1]; }; extern struct pid init_struct_pid; extern const struct file_operations pidfd_fops; struct file; extern struct pid *pidfd_pid(const struct file *file); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags); static inline struct pid *get_pid(struct pid *pid) { if (pid) refcount_inc(&pid->count); return pid; } extern void put_pid(struct pid *pid); extern struct task_struct *pid_task(struct pid *pid, enum pid_type); static inline bool pid_has_task(struct pid *pid, enum pid_type type) { return !hlist_empty(&pid->tasks[type]); } extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type); extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type); /* * these helpers must be called with the tasklist_lock write-held. */ extern void attach_pid(struct task_struct *task, enum pid_type); extern void detach_pid(struct task_struct *task, enum pid_type); extern void change_pid(struct task_struct *task, enum pid_type, struct pid *pid); extern void exchange_tids(struct task_struct *task, struct task_struct *old); extern void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type); struct pid_namespace; extern struct pid_namespace init_pid_ns; extern int pid_max; extern int pid_max_min, pid_max_max; /* * look up a PID in the hash table. Must be called with the tasklist_lock * or rcu_read_lock() held. * * find_pid_ns() finds the pid in the namespace specified * find_vpid() finds the pid by its virtual id, i.e. in the current namespace * * see also find_task_by_vpid() set in include/linux/sched.h */ extern struct pid *find_pid_ns(int nr, struct pid_namespace *ns); extern struct pid *find_vpid(int nr); /* * Lookup a PID in the hash table, and return with it's count elevated. */ extern struct pid *find_get_pid(int nr); extern struct pid *find_ge_pid(int nr, struct pid_namespace *); extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, size_t set_tid_size); extern void free_pid(struct pid *pid); extern void disable_pid_allocation(struct pid_namespace *ns); /* * ns_of_pid() returns the pid namespace in which the specified pid was * allocated. * * NOTE: * ns_of_pid() is expected to be called for a process (task) that has * an attached 'struct pid' (see attach_pid(), detach_pid()) i.e @pid * is expected to be non-NULL. If @pid is NULL, caller should handle * the resulting NULL pid-ns. */ static inline struct pid_namespace *ns_of_pid(struct pid *pid) { struct pid_namespace *ns = NULL; if (pid) ns = pid->numbers[pid->level].ns; return ns; } /* * is_child_reaper returns true if the pid is the init process * of the current namespace. As this one could be checked before * pid_ns->child_reaper is assigned in copy_process, we check * with the pid number. */ static inline bool is_child_reaper(struct pid *pid) { return pid->numbers[pid->level].nr == 1; } /* * the helpers to get the pid's id seen from different namespaces * * pid_nr() : global id, i.e. the id seen from the init namespace; * pid_vnr() : virtual id, i.e. the id seen from the pid namespace of * current. * pid_nr_ns() : id seen from the ns specified. * * see also task_xid_nr() etc in include/linux/sched.h */ static inline pid_t pid_nr(struct pid *pid) { pid_t nr = 0; if (pid) nr = pid->numbers[0].nr; return nr; } pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns); pid_t pid_vnr(struct pid *pid); #define do_each_pid_task(pid, type, task) \ do { \ if ((pid) != NULL) \ hlist_for_each_entry_rcu((task), \ &(pid)->tasks[type], pid_links[type]) { /* * Both old and new leaders may be attached to * the same pid in the middle of de_thread(). */ #define while_each_pid_task(pid, type, task) \ if (type == PIDTYPE_PID) \ break; \ } \ } while (0) #define do_each_pid_thread(pid, type, task) \ do_each_pid_task(pid, type, task) { \ struct task_struct *tg___ = task; \ for_each_thread(tg___, task) { #define while_each_pid_thread(pid, type, task) \ } \ task = tg___; \ } while_each_pid_task(pid, type, task) #endif /* _LINUX_PID_H */
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/types.h> #include <linux/errno.h> #include <linux/signal.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/tty.h> #include <linux/fcntl.h> #include <linux/uaccess.h> #include "tty.h" static int is_ignored(int sig) { return (sigismember(&current->blocked, sig) || current->sighand->action[sig-1].sa.sa_handler == SIG_IGN); } /** * tty_check_change - check for POSIX terminal changes * @tty: tty to check * * If we try to write to, or set the state of, a terminal and we're * not in the foreground, send a SIGTTOU. If the signal is blocked or * ignored, go ahead and perform the operation. (POSIX 7.2) * * Locking: ctrl_lock */ int __tty_check_change(struct tty_struct *tty, int sig) { unsigned long flags; struct pid *pgrp, *tty_pgrp; int ret = 0; if (current->signal->tty != tty) return 0; rcu_read_lock(); pgrp = task_pgrp(current); spin_lock_irqsave(&tty->ctrl_lock, flags); tty_pgrp = tty->pgrp; spin_unlock_irqrestore(&tty->ctrl_lock, flags); if (tty_pgrp && pgrp != tty_pgrp) { if (is_ignored(sig)) { if (sig == SIGTTIN) ret = -EIO; } else if (is_current_pgrp_orphaned()) ret = -EIO; else { kill_pgrp(pgrp, sig, 1); set_thread_flag(TIF_SIGPENDING); ret = -ERESTARTSYS; } } rcu_read_unlock(); if (!tty_pgrp) tty_warn(tty, "sig=%d, tty->pgrp == NULL!\n", sig); return ret; } int tty_check_change(struct tty_struct *tty) { return __tty_check_change(tty, SIGTTOU); } EXPORT_SYMBOL(tty_check_change); void proc_clear_tty(struct task_struct *p) { unsigned long flags; struct tty_struct *tty; spin_lock_irqsave(&p->sighand->siglock, flags); tty = p->signal->tty; p->signal->tty = NULL; spin_unlock_irqrestore(&p->sighand->siglock, flags); tty_kref_put(tty); } /** * proc_set_tty - set the controlling terminal * * Only callable by the session leader and only if it does not already have * a controlling terminal. * * Caller must hold: tty_lock() * a readlock on tasklist_lock * sighand lock */ static void __proc_set_tty(struct tty_struct *tty) { unsigned long flags; spin_lock_irqsave(&tty->ctrl_lock, flags); /* * The session and fg pgrp references will be non-NULL if * tiocsctty() is stealing the controlling tty */ put_pid(tty->session); put_pid(tty->pgrp); tty->pgrp = get_pid(task_pgrp(current)); tty->session = get_pid(task_session(current)); spin_unlock_irqrestore(&tty->ctrl_lock, flags); if (current->signal->tty) { tty_debug(tty, "current tty %s not NULL!!\n", current->signal->tty->name); tty_kref_put(current->signal->tty); } put_pid(current->signal->tty_old_pgrp); current->signal->tty = tty_kref_get(tty); current->signal->tty_old_pgrp = NULL; } static void proc_set_tty(struct tty_struct *tty) { spin_lock_irq(&current->sighand->siglock); __proc_set_tty(tty); spin_unlock_irq(&current->sighand->siglock); } /* * Called by tty_open() to set the controlling tty if applicable. */ void tty_open_proc_set_tty(struct file *filp, struct tty_struct *tty) { read_lock(&tasklist_lock); spin_lock_irq(&current->sighand->siglock); if (current->signal->leader && !current->signal->tty && tty->session == NULL) { /* * Don't let a process that only has write access to the tty * obtain the privileges associated with having a tty as * controlling terminal (being able to reopen it with full * access through /dev/tty, being able to perform pushback). * Many distributions set the group of all ttys to "tty" and * grant write-only access to all terminals for setgid tty * binaries, which should not imply full privileges on all ttys. * * This could theoretically break old code that performs open() * on a write-only file descriptor. In that case, it might be * necessary to also permit this if * inode_permission(inode, MAY_READ) == 0. */ if (filp->f_mode & FMODE_READ) __proc_set_tty(tty); } spin_unlock_irq(&current->sighand->siglock); read_unlock(&tasklist_lock); } struct tty_struct *get_current_tty(void) { struct tty_struct *tty; unsigned long flags; spin_lock_irqsave(&current->sighand->siglock, flags); tty = tty_kref_get(current->signal->tty); spin_unlock_irqrestore(&current->sighand->siglock, flags); return tty; } EXPORT_SYMBOL_GPL(get_current_tty); /* * Called from tty_release(). */ void session_clear_tty(struct pid *session) { struct task_struct *p; do_each_pid_task(session, PIDTYPE_SID, p) { proc_clear_tty(p); } while_each_pid_task(session, PIDTYPE_SID, p); } /** * tty_signal_session_leader - sends SIGHUP to session leader * @tty: controlling tty * @exit_session: if non-zero, signal all foreground group processes * * Send SIGHUP and SIGCONT to the session leader and its process group. * Optionally, signal all processes in the foreground process group. * * Returns the number of processes in the session with this tty * as their controlling terminal. This value is used to drop * tty references for those processes. */ int tty_signal_session_leader(struct tty_struct *tty, int exit_session) { struct task_struct *p; int refs = 0; struct pid *tty_pgrp = NULL; read_lock(&tasklist_lock); if (tty->session) { do_each_pid_task(tty->session, PIDTYPE_SID, p) { spin_lock_irq(&p->sighand->siglock); if (p->signal->tty == tty) { p->signal->tty = NULL; /* We defer the dereferences outside fo the tasklist lock */ refs++; } if (!p->signal->leader) { spin_unlock_irq(&p->sighand->siglock); continue; } __group_send_sig_info(SIGHUP, SEND_SIG_PRIV, p); __group_send_sig_info(SIGCONT, SEND_SIG_PRIV, p); put_pid(p->signal->tty_old_pgrp); /* A noop */ spin_lock(&tty->ctrl_lock); tty_pgrp = get_pid(tty->pgrp); if (tty->pgrp) p->signal->tty_old_pgrp = get_pid(tty->pgrp); spin_unlock(&tty->ctrl_lock); spin_unlock_irq(&p->sighand->siglock); } while_each_pid_task(tty->session, PIDTYPE_SID, p); } read_unlock(&tasklist_lock); if (tty_pgrp) { if (exit_session) kill_pgrp(tty_pgrp, SIGHUP, exit_session); put_pid(tty_pgrp); } return refs; } /** * disassociate_ctty - disconnect controlling tty * @on_exit: true if exiting so need to "hang up" the session * * This function is typically called only by the session leader, when * it wants to disassociate itself from its controlling tty. * * It performs the following functions: * (1) Sends a SIGHUP and SIGCONT to the foreground process group * (2) Clears the tty from being controlling the session * (3) Clears the controlling tty for all processes in the * session group. * * The argument on_exit is set to 1 if called when a process is * exiting; it is 0 if called by the ioctl TIOCNOTTY. * * Locking: * BTM is taken for hysterical raisons, and held when * called from no_tty(). * tty_mutex is taken to protect tty * ->siglock is taken to protect ->signal/->sighand * tasklist_lock is taken to walk process list for sessions * ->siglock is taken to protect ->signal/->sighand */ void disassociate_ctty(int on_exit) { struct tty_struct *tty; if (!current->signal->leader) return; tty = get_current_tty(); if (tty) { if (on_exit && tty->driver->type != TTY_DRIVER_TYPE_PTY) { tty_vhangup_session(tty); } else { struct pid *tty_pgrp = tty_get_pgrp(tty); if (tty_pgrp) { kill_pgrp(tty_pgrp, SIGHUP, on_exit); if (!on_exit) kill_pgrp(tty_pgrp, SIGCONT, on_exit); put_pid(tty_pgrp); } } tty_kref_put(tty); } else if (on_exit) { struct pid *old_pgrp; spin_lock_irq(&current->sighand->siglock); old_pgrp = current->signal->tty_old_pgrp; current->signal->tty_old_pgrp = NULL; spin_unlock_irq(&current->sighand->siglock); if (old_pgrp) { kill_pgrp(old_pgrp, SIGHUP, on_exit); kill_pgrp(old_pgrp, SIGCONT, on_exit); put_pid(old_pgrp); } return; } tty = get_current_tty(); if (tty) { unsigned long flags; tty_lock(tty); spin_lock_irqsave(&tty->ctrl_lock, flags); put_pid(tty->session); put_pid(tty->pgrp); tty->session = NULL; tty->pgrp = NULL; spin_unlock_irqrestore(&tty->ctrl_lock, flags); tty_unlock(tty); tty_kref_put(tty); } /* If tty->ctrl.pgrp is not NULL, it may be assigned to * current->signal->tty_old_pgrp in a race condition, and * cause pid memleak. Release current->signal->tty_old_pgrp * after tty->ctrl.pgrp set to NULL. */ spin_lock_irq(&current->sighand->siglock); put_pid(current->signal->tty_old_pgrp); current->signal->tty_old_pgrp = NULL; spin_unlock_irq(&current->sighand->siglock); /* Now clear signal->tty under the lock */ read_lock(&tasklist_lock); session_clear_tty(task_session(current)); read_unlock(&tasklist_lock); } /* * * no_tty - Ensure the current process does not have a controlling tty */ void no_tty(void) { /* FIXME: Review locking here. The tty_lock never covered any race between a new association and proc_clear_tty but possible we need to protect against this anyway */ struct task_struct *tsk = current; disassociate_ctty(0); proc_clear_tty(tsk); } /** * tiocsctty - set controlling tty * @tty: tty structure * @arg: user argument * * This ioctl is used to manage job control. It permits a session * leader to set this tty as the controlling tty for the session. * * Locking: * Takes tty_lock() to serialize proc_set_tty() for this tty * Takes tasklist_lock internally to walk sessions * Takes ->siglock() when updating signal->tty */ static int tiocsctty(struct tty_struct *tty, struct file *file, int arg) { int ret = 0; tty_lock(tty); read_lock(&tasklist_lock); if (current->signal->leader && (task_session(current) == tty->session)) goto unlock; /* * The process must be a session leader and * not have a controlling tty already. */ if (!current->signal->leader || current->signal->tty) { ret = -EPERM; goto unlock; } if (tty->session) { /* * This tty is already the controlling * tty for another session group! */ if (arg == 1 && capable(CAP_SYS_ADMIN)) { /* * Steal it away */ session_clear_tty(tty->session); } else { ret = -EPERM; goto unlock; } } /* See the comment in tty_open_proc_set_tty(). */ if ((file->f_mode & FMODE_READ) == 0 && !capable(CAP_SYS_ADMIN)) { ret = -EPERM; goto unlock; } proc_set_tty(tty); unlock: read_unlock(&tasklist_lock); tty_unlock(tty); return ret; } /** * tty_get_pgrp - return a ref counted pgrp pid * @tty: tty to read * * Returns a refcounted instance of the pid struct for the process * group controlling the tty. */ struct pid *tty_get_pgrp(struct tty_struct *tty) { unsigned long flags; struct pid *pgrp; spin_lock_irqsave(&tty->ctrl_lock, flags); pgrp = get_pid(tty->pgrp); spin_unlock_irqrestore(&tty->ctrl_lock, flags); return pgrp; } EXPORT_SYMBOL_GPL(tty_get_pgrp); /* * This checks not only the pgrp, but falls back on the pid if no * satisfactory pgrp is found. I dunno - gdb doesn't work correctly * without this... * * The caller must hold rcu lock or the tasklist lock. */ static struct pid *session_of_pgrp(struct pid *pgrp) { struct task_struct *p; struct pid *sid = NULL; p = pid_task(pgrp, PIDTYPE_PGID); if (p == NULL) p = pid_task(pgrp, PIDTYPE_PID); if (p != NULL) sid = task_session(p); return sid; } /** * tiocgpgrp - get process group * @tty: tty passed by user * @real_tty: tty side of the tty passed by the user if a pty else the tty * @p: returned pid * * Obtain the process group of the tty. If there is no process group * return an error. * * Locking: none. Reference to current->signal->tty is safe. */ static int tiocgpgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) { struct pid *pid; int ret; /* * (tty == real_tty) is a cheap way of * testing if the tty is NOT a master pty. */ if (tty == real_tty && current->signal->tty != real_tty) return -ENOTTY; pid = tty_get_pgrp(real_tty); ret = put_user(pid_vnr(pid), p); put_pid(pid); return ret; } /** * tiocspgrp - attempt to set process group * @tty: tty passed by user * @real_tty: tty side device matching tty passed by user * @p: pid pointer * * Set the process group of the tty to the session passed. Only * permitted where the tty session is our session. * * Locking: RCU, ctrl lock */ static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) { struct pid *pgrp; pid_t pgrp_nr; int retval = tty_check_change(real_tty); if (retval == -EIO) return -ENOTTY; if (retval) return retval; if (get_user(pgrp_nr, p)) return -EFAULT; if (pgrp_nr < 0) return -EINVAL; spin_lock_irq(&real_tty->ctrl_lock); if (!current->signal->tty || (current->signal->tty != real_tty) || (real_tty->session != task_session(current))) { retval = -ENOTTY; goto out_unlock_ctrl; } rcu_read_lock(); pgrp = find_vpid(pgrp_nr); retval = -ESRCH; if (!pgrp) goto out_unlock; retval = -EPERM; if (session_of_pgrp(pgrp) != task_session(current)) goto out_unlock; retval = 0; put_pid(real_tty->pgrp); real_tty->pgrp = get_pid(pgrp); out_unlock: rcu_read_unlock(); out_unlock_ctrl: spin_unlock_irq(&real_tty->ctrl_lock); return retval; } /** * tiocgsid - get session id * @tty: tty passed by user * @real_tty: tty side of the tty passed by the user if a pty else the tty * @p: pointer to returned session id * * Obtain the session id of the tty. If there is no session * return an error. */ static int tiocgsid(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) { unsigned long flags; pid_t sid; /* * (tty == real_tty) is a cheap way of * testing if the tty is NOT a master pty. */ if (tty == real_tty && current->signal->tty != real_tty) return -ENOTTY; spin_lock_irqsave(&real_tty->ctrl_lock, flags); if (!real_tty->session) goto err; sid = pid_vnr(real_tty->session); spin_unlock_irqrestore(&real_tty->ctrl_lock, flags); return put_user(sid, p); err: spin_unlock_irqrestore(&real_tty->ctrl_lock, flags); return -ENOTTY; } /* * Called from tty_ioctl(). If tty is a pty then real_tty is the slave side, * if not then tty == real_tty. */ long tty_jobctrl_ioctl(struct tty_struct *tty, struct tty_struct *real_tty, struct file *file, unsigned int cmd, unsigned long arg) { void __user *p = (void __user *)arg; switch (cmd) { case TIOCNOTTY: if (current->signal->tty != tty) return -ENOTTY; no_tty(); return 0; case TIOCSCTTY: return tiocsctty(real_tty, file, arg); case TIOCGPGRP: return tiocgpgrp(tty, real_tty, p); case TIOCSPGRP: return tiocspgrp(tty, real_tty, p); case TIOCGSID: return tiocgsid(tty, real_tty, p); } return -ENOIOCTLCMD; }
2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 // SPDX-License-Identifier: GPL-2.0 /* * trace event based perf event profiling/tracing * * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com> */ #include <linux/module.h> #include <linux/kprobes.h> #include <linux/security.h> #include "trace.h" #include "trace_probe.h" static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; /* * Force it to be aligned to unsigned long to avoid misaligned accesses * suprises */ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) perf_trace_t; /* Count the events in use (per event id, not per instance) */ static int total_ref_count; static int perf_trace_event_perm(struct trace_event_call *tp_event, struct perf_event *p_event) { int ret; if (tp_event->perf_perm) { ret = tp_event->perf_perm(tp_event, p_event); if (ret) return ret; } /* * We checked and allowed to create parent, * allow children without checking. */ if (p_event->parent) return 0; /* * It's ok to check current process (owner) permissions in here, * because code below is called only via perf_event_open syscall. */ /* The ftrace function trace is allowed only for root. */ if (ftrace_event_is_function(tp_event)) { ret = perf_allow_tracepoint(&p_event->attr); if (ret) return ret; if (!is_sampling_event(p_event)) return 0; /* * We don't allow user space callchains for function trace * event, due to issues with page faults while tracing page * fault handler and its overall trickiness nature. */ if (!p_event->attr.exclude_callchain_user) return -EINVAL; /* * Same reason to disable user stack dump as for user space * callchains above. */ if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER) return -EINVAL; } /* No tracing, just counting, so no obvious leak */ if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) return 0; /* Some events are ok to be traced by non-root users... */ if (p_event->attach_state == PERF_ATTACH_TASK) { if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY) return 0; } /* * ...otherwise raw tracepoint data can be a severe data leak, * only allow root to have these. */ ret = perf_allow_tracepoint(&p_event->attr); if (ret) return ret; return 0; } static int perf_trace_event_reg(struct trace_event_call *tp_event, struct perf_event *p_event) { struct hlist_head __percpu *list; int ret = -ENOMEM; int cpu; p_event->tp_event = tp_event; if (tp_event->perf_refcount++ > 0) return 0; list = alloc_percpu(struct hlist_head); if (!list) goto fail; for_each_possible_cpu(cpu) INIT_HLIST_HEAD(per_cpu_ptr(list, cpu)); tp_event->perf_events = list; if (!total_ref_count) { char __percpu *buf; int i; for (i = 0; i < PERF_NR_CONTEXTS; i++) { buf = (char __percpu *)alloc_percpu(perf_trace_t); if (!buf) goto fail; perf_trace_buf[i] = buf; } } ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL); if (ret) goto fail; total_ref_count++; return 0; fail: if (!total_ref_count) { int i; for (i = 0; i < PERF_NR_CONTEXTS; i++) { free_percpu(perf_trace_buf[i]); perf_trace_buf[i] = NULL; } } if (!--tp_event->perf_refcount) { free_percpu(tp_event->perf_events); tp_event->perf_events = NULL; } return ret; } static void perf_trace_event_unreg(struct perf_event *p_event) { struct trace_event_call *tp_event = p_event->tp_event; int i; if (--tp_event->perf_refcount > 0) goto out; tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL); /* * Ensure our callback won't be called anymore. The buffers * will be freed after that. */ tracepoint_synchronize_unregister(); free_percpu(tp_event->perf_events); tp_event->perf_events = NULL; if (!--total_ref_count) { for (i = 0; i < PERF_NR_CONTEXTS; i++) { free_percpu(perf_trace_buf[i]); perf_trace_buf[i] = NULL; } } out: module_put(tp_event->mod); } static int perf_trace_event_open(struct perf_event *p_event) { struct trace_event_call *tp_event = p_event->tp_event; return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event); } static void perf_trace_event_close(struct perf_event *p_event) { struct trace_event_call *tp_event = p_event->tp_event; tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event); } static int perf_trace_event_init(struct trace_event_call *tp_event, struct perf_event *p_event) { int ret; ret = perf_trace_event_perm(tp_event, p_event); if (ret) return ret; ret = perf_trace_event_reg(tp_event, p_event); if (ret) return ret; ret = perf_trace_event_open(p_event); if (ret) { perf_trace_event_unreg(p_event); return ret; } return 0; } int perf_trace_init(struct perf_event *p_event) { struct trace_event_call *tp_event; u64 event_id = p_event->attr.config; int ret = -EINVAL; mutex_lock(&event_mutex); list_for_each_entry(tp_event, &ftrace_events, list) { if (tp_event->event.type == event_id && tp_event->class && tp_event->class->reg && try_module_get(tp_event->mod)) { ret = perf_trace_event_init(tp_event, p_event); if (ret) module_put(tp_event->mod); break; } } mutex_unlock(&event_mutex); return ret; } void perf_trace_destroy(struct perf_event *p_event) { mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); mutex_unlock(&event_mutex); } #ifdef CONFIG_KPROBE_EVENTS int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) { int ret; char *func = NULL; struct trace_event_call *tp_event; if (p_event->attr.kprobe_func) { func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL); if (!func) return -ENOMEM; ret = strncpy_from_user( func, u64_to_user_ptr(p_event->attr.kprobe_func), KSYM_NAME_LEN); if (ret == KSYM_NAME_LEN) ret = -E2BIG; if (ret < 0) goto out; if (func[0] == '\0') { kfree(func); func = NULL; } } tp_event = create_local_trace_kprobe( func, (void *)(unsigned long)(p_event->attr.kprobe_addr), p_event->attr.probe_offset, is_retprobe); if (IS_ERR(tp_event)) { ret = PTR_ERR(tp_event); goto out; } mutex_lock(&event_mutex); ret = perf_trace_event_init(tp_event, p_event); if (ret) destroy_local_trace_kprobe(tp_event); mutex_unlock(&event_mutex); out: kfree(func); return ret; } void perf_kprobe_destroy(struct perf_event *p_event) { mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); mutex_unlock(&event_mutex); destroy_local_trace_kprobe(p_event->tp_event); } #endif /* CONFIG_KPROBE_EVENTS */ #ifdef CONFIG_UPROBE_EVENTS int perf_uprobe_init(struct perf_event *p_event, unsigned long ref_ctr_offset, bool is_retprobe) { int ret; char *path = NULL; struct trace_event_call *tp_event; if (!p_event->attr.uprobe_path) return -EINVAL; path = strndup_user(u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX); if (IS_ERR(path)) { ret = PTR_ERR(path); return (ret == -EINVAL) ? -E2BIG : ret; } if (path[0] == '\0') { ret = -EINVAL; goto out; } tp_event = create_local_trace_uprobe(path, p_event->attr.probe_offset, ref_ctr_offset, is_retprobe); if (IS_ERR(tp_event)) { ret = PTR_ERR(tp_event); goto out; } /* * local trace_uprobe need to hold event_mutex to call * uprobe_buffer_enable() and uprobe_buffer_disable(). * event_mutex is not required for local trace_kprobes. */ mutex_lock(&event_mutex); ret = perf_trace_event_init(tp_event, p_event); if (ret) destroy_local_trace_uprobe(tp_event); mutex_unlock(&event_mutex); out: kfree(path); return ret; } void perf_uprobe_destroy(struct perf_event *p_event) { mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); mutex_unlock(&event_mutex); destroy_local_trace_uprobe(p_event->tp_event); } #endif /* CONFIG_UPROBE_EVENTS */ int perf_trace_add(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; if (!(flags & PERF_EF_START)) p_event->hw.state = PERF_HES_STOPPED; /* * If TRACE_REG_PERF_ADD returns false; no custom action was performed * and we need to take the default action of enqueueing our event on * the right per-cpu hlist. */ if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) { struct hlist_head __percpu *pcpu_list; struct hlist_head *list; pcpu_list = tp_event->perf_events; if (WARN_ON_ONCE(!pcpu_list)) return -EINVAL; list = this_cpu_ptr(pcpu_list); hlist_add_head_rcu(&p_event->hlist_entry, list); } return 0; } void perf_trace_del(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; /* * If TRACE_REG_PERF_DEL returns false; no custom action was performed * and we need to take the default action of dequeueing our event from * the right per-cpu hlist. */ if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event)) hlist_del_rcu(&p_event->hlist_entry); } void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp) { char *raw_data; int rctx; BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "perf buffer not large enough")) return NULL; *rctxp = rctx = perf_swevent_get_recursion_context(); if (rctx < 0) return NULL; if (regs) *regs = this_cpu_ptr(&__perf_regs[rctx]); raw_data = this_cpu_ptr(perf_trace_buf[rctx]); /* zero the dead bytes from align to not leak stack to user */ memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); return raw_data; } EXPORT_SYMBOL_GPL(perf_trace_buf_alloc); NOKPROBE_SYMBOL(perf_trace_buf_alloc); void perf_trace_buf_update(void *record, u16 type) { struct trace_entry *entry = record; int pc = preempt_count(); unsigned long flags; local_save_flags(flags); tracing_generic_entry_update(entry, type, flags, pc); } NOKPROBE_SYMBOL(perf_trace_buf_update); #ifdef CONFIG_FUNCTION_TRACER static void perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *pt_regs) { struct ftrace_entry *entry; struct perf_event *event; struct hlist_head head; struct pt_regs regs; int rctx; if ((unsigned long)ops->private != smp_processor_id()) return; event = container_of(ops, struct perf_event, ftrace_ops); /* * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all * the perf code does is hlist_for_each_entry_rcu(), so we can * get away with simply setting the @head.first pointer in order * to create a singular list. */ head.first = &event->hlist_entry; #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ sizeof(u64)) - sizeof(u32)) BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE); memset(&regs, 0, sizeof(regs)); perf_fetch_caller_regs(&regs); entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx); if (!entry) return; entry->ip = ip; entry->parent_ip = parent_ip; perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, 1, &regs, &head, NULL); #undef ENTRY_SIZE } static int perf_ftrace_function_register(struct perf_event *event) { struct ftrace_ops *ops = &event->ftrace_ops; ops->flags = FTRACE_OPS_FL_RCU; ops->func = perf_ftrace_function_call; ops->private = (void *)(unsigned long)nr_cpu_ids; return register_ftrace_function(ops); } static int perf_ftrace_function_unregister(struct perf_event *event) { struct ftrace_ops *ops = &event->ftrace_ops; int ret = unregister_ftrace_function(ops); ftrace_free_filter(ops); return ret; } int perf_ftrace_event_register(struct trace_event_call *call, enum trace_reg type, void *data) { struct perf_event *event = data; switch (type) { case TRACE_REG_REGISTER: case TRACE_REG_UNREGISTER: break; case TRACE_REG_PERF_REGISTER: case TRACE_REG_PERF_UNREGISTER: return 0; case TRACE_REG_PERF_OPEN: return perf_ftrace_function_register(data); case TRACE_REG_PERF_CLOSE: return perf_ftrace_function_unregister(data); case TRACE_REG_PERF_ADD: event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id(); return 1; case TRACE_REG_PERF_DEL: event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids; return 1; } return -EINVAL; } #endif /* CONFIG_FUNCTION_TRACER */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_SIGNAL_H #define _LINUX_SCHED_SIGNAL_H #include <linux/rculist.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/sched/jobctl.h> #include <linux/sched/task.h> #include <linux/cred.h> #include <linux/refcount.h> #include <linux/posix-timers.h> #include <linux/mm_types.h> #include <asm/ptrace.h> /* * Types defining task->signal and task->sighand and APIs using them: */ struct sighand_struct { spinlock_t siglock; refcount_t count; wait_queue_head_t signalfd_wqh; struct k_sigaction action[_NSIG]; }; /* * Per-process accounting stats: */ struct pacct_struct { int ac_flag; long ac_exitcode; unsigned long ac_mem; u64 ac_utime, ac_stime; unsigned long ac_minflt, ac_majflt; }; struct cpu_itimer { u64 expires; u64 incr; }; /* * This is the atomic variant of task_cputime, which can be used for * storing and updating task_cputime statistics without locking. */ struct task_cputime_atomic { atomic64_t utime; atomic64_t stime; atomic64_t sum_exec_runtime; }; #define INIT_CPUTIME_ATOMIC \ (struct task_cputime_atomic) { \ .utime = ATOMIC64_INIT(0), \ .stime = ATOMIC64_INIT(0), \ .sum_exec_runtime = ATOMIC64_INIT(0), \ } /** * struct thread_group_cputimer - thread group interval timer counts * @cputime_atomic: atomic thread group interval timers. * * This structure contains the version of task_cputime, above, that is * used for thread group CPU timer calculations. */ struct thread_group_cputimer { struct task_cputime_atomic cputime_atomic; }; struct multiprocess_signals { sigset_t signal; struct hlist_node node; }; /* * NOTE! "signal_struct" does not have its own * locking, because a shared signal_struct always * implies a shared sighand_struct, so locking * sighand_struct is always a proper superset of * the locking of signal_struct. */ struct signal_struct { refcount_t sigcnt; atomic_t live; int nr_threads; struct list_head thread_head; wait_queue_head_t wait_chldexit; /* for wait4() */ /* current thread group signal load-balancing target: */ struct task_struct *curr_target; /* shared signal handling: */ struct sigpending shared_pending; /* For collecting multiprocess signals during fork */ struct hlist_head multiprocess; /* thread group exit support */ int group_exit_code; /* overloaded: * - notify group_exit_task when ->count is equal to notify_count * - everyone except group_exit_task is stopped during signal delivery * of fatal signals, group_exit_task processes the signal. */ int notify_count; struct task_struct *group_exit_task; /* thread group stop support, overloads group_exit_code too */ int group_stop_count; unsigned int flags; /* see SIGNAL_* flags below */ /* * PR_SET_CHILD_SUBREAPER marks a process, like a service * manager, to re-parent orphan (double-forking) child processes * to this process instead of 'init'. The service manager is * able to receive SIGCHLD signals and is able to investigate * the process until it calls wait(). All children of this * process will inherit a flag if they should look for a * child_subreaper process at exit. */ unsigned int is_child_subreaper:1; unsigned int has_child_subreaper:1; #ifdef CONFIG_POSIX_TIMERS /* POSIX.1b Interval Timers */ unsigned int next_posix_timer_id; struct list_head posix_timers; /* ITIMER_REAL timer for the process */ struct hrtimer real_timer; ktime_t it_real_incr; /* * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these * values are defined to 0 and 1 respectively */ struct cpu_itimer it[2]; /* * Thread group totals for process CPU timers. * See thread_group_cputimer(), et al, for details. */ struct thread_group_cputimer cputimer; #endif /* Empty if CONFIG_POSIX_TIMERS=n */ struct posix_cputimers posix_cputimers; /* PID/PID hash table linkage. */ struct pid *pids[PIDTYPE_MAX]; #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif struct pid *tty_old_pgrp; /* boolean value for session group leader */ int leader; struct tty_struct *tty; /* NULL if no tty */ #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup *autogroup; #endif /* * Cumulative resource counters for dead threads in the group, * and for reaped dead child processes forked by this group. * Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ seqlock_t stats_lock; u64 utime, stime, cutime, cstime; u64 gtime; u64 cgtime; struct prev_cputime prev_cputime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; unsigned long maxrss, cmaxrss; struct task_io_accounting ioac; /* * Cumulative ns of schedule CPU time fo dead threads in the * group, not including a zombie group leader, (This only differs * from jiffies_to_ns(utime + stime) if sched_clock uses something * other than jiffies.) */ unsigned long long sum_sched_runtime; /* * We don't bother to synchronize most readers of this at all, * because there is no reader checking a limit that actually needs * to get both rlim_cur and rlim_max atomically, and either one * alone is a single word that can safely be read normally. * getrlimit/setrlimit use task_lock(current->group_leader) to * protect this instead of the siglock, because they really * have no need to disable irqs. */ struct rlimit rlim[RLIM_NLIMITS]; #ifdef CONFIG_BSD_PROCESS_ACCT struct pacct_struct pacct; /* per-process accounting information */ #endif #ifdef CONFIG_TASKSTATS struct taskstats *stats; #endif #ifdef CONFIG_AUDIT unsigned audit_tty; struct tty_audit_buf *tty_audit_buf; #endif /* * Thread is the potential origin of an oom condition; kill first on * oom */ bool oom_flag_origin; short oom_score_adj; /* OOM kill score adjustment */ short oom_score_adj_min; /* OOM kill score adjustment min value. * Only settable by CAP_SYS_RESOURCE. */ struct mm_struct *oom_mm; /* recorded mm when the thread group got * killed by the oom killer */ struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations * (notably. ptrace) * Deprecated do not use in new code. * Use exec_update_lock instead. */ struct rw_semaphore exec_update_lock; /* Held while task_struct is * being updated during exec, * and may have inconsistent * permissions. */ } __randomize_layout; /* * Bits in flags field of signal_struct. */ #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ #define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ #define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ #define SIGNAL_GROUP_COREDUMP 0x00000008 /* coredump in progress */ /* * Pending notifications to parent. */ #define SIGNAL_CLD_STOPPED 0x00000010 #define SIGNAL_CLD_CONTINUED 0x00000020 #define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED) #define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ #define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \ SIGNAL_STOP_CONTINUED) static inline void signal_set_stop_flags(struct signal_struct *sig, unsigned int flags) { WARN_ON(sig->flags & (SIGNAL_GROUP_EXIT|SIGNAL_GROUP_COREDUMP)); sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; } /* If true, all threads except ->group_exit_task have pending SIGKILL */ static inline int signal_group_exit(const struct signal_struct *sig) { return (sig->flags & SIGNAL_GROUP_EXIT) || (sig->group_exit_task != NULL); } extern void flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); extern int dequeue_signal(struct task_struct *task, sigset_t *mask, kernel_siginfo_t *info); static inline int kernel_dequeue_signal(void) { struct task_struct *task = current; kernel_siginfo_t __info; int ret; spin_lock_irq(&task->sighand->siglock); ret = dequeue_signal(task, &task->blocked, &__info); spin_unlock_irq(&task->sighand->siglock); return ret; } static inline void kernel_signal_stop(void) { spin_lock_irq(&current->sighand->siglock); if (current->jobctl & JOBCTL_STOP_DEQUEUED) set_special_state(TASK_STOPPED); spin_unlock_irq(&current->sighand->siglock); schedule(); } #ifdef __ARCH_SI_TRAPNO # define ___ARCH_SI_TRAPNO(_a1) , _a1 #else # define ___ARCH_SI_TRAPNO(_a1) #endif #ifdef __ia64__ # define ___ARCH_SI_IA64(_a1, _a2, _a3) , _a1, _a2, _a3 #else # define ___ARCH_SI_IA64(_a1, _a2, _a3) #endif int force_sig_fault_to_task(int sig, int code, void __user *addr ___ARCH_SI_TRAPNO(int trapno) ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) , struct task_struct *t); int force_sig_fault(int sig, int code, void __user *addr ___ARCH_SI_TRAPNO(int trapno) ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)); int send_sig_fault(int sig, int code, void __user *addr ___ARCH_SI_TRAPNO(int trapno) ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) , struct task_struct *t); int force_sig_mceerr(int code, void __user *, short); int send_sig_mceerr(int code, void __user *, short, struct task_struct *); int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper); int force_sig_pkuerr(void __user *addr, u32 pkey); int force_sig_ptrace_errno_trap(int errno, void __user *addr); extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *); extern void force_sigsegv(int sig); extern int force_sig_info(struct kernel_siginfo *); extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp); extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid); extern int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, struct pid *, const struct cred *); extern int kill_pgrp(struct pid *pid, int sig, int priv); extern int kill_pid(struct pid *pid, int sig, int priv); extern __must_check bool do_notify_parent(struct task_struct *, int); extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); extern void force_sig(int); extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); extern struct sigqueue *sigqueue_alloc(void); extern void sigqueue_free(struct sigqueue *); extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type); extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); static inline int restart_syscall(void) { set_tsk_thread_flag(current, TIF_SIGPENDING); return -ERESTARTNOINTR; } static inline int task_sigpending(struct task_struct *p) { return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); } static inline int signal_pending(struct task_struct *p) { /* * TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same * behavior in terms of ensuring that we break out of wait loops * so that notify signal callbacks can be processed. */ if (unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL))) return 1; return task_sigpending(p); } static inline int __fatal_signal_pending(struct task_struct *p) { return unlikely(sigismember(&p->pending.signal, SIGKILL)); } static inline int fatal_signal_pending(struct task_struct *p) { return task_sigpending(p) && __fatal_signal_pending(p); } static inline int signal_pending_state(long state, struct task_struct *p) { if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) return 0; if (!signal_pending(p)) return 0; return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); } /* * This should only be used in fault handlers to decide whether we * should stop the current fault routine to handle the signals * instead, especially with the case where we've got interrupted with * a VM_FAULT_RETRY. */ static inline bool fault_signal_pending(vm_fault_t fault_flags, struct pt_regs *regs) { return unlikely((fault_flags & VM_FAULT_RETRY) && (fatal_signal_pending(current) || (user_mode(regs) && signal_pending(current)))); } /* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. * This is required every time the blocked sigset_t changes. * callers must hold sighand->siglock. */ extern void recalc_sigpending_and_wake(struct task_struct *t); extern void recalc_sigpending(void); extern void calculate_sigpending(void); extern void signal_wake_up_state(struct task_struct *t, unsigned int state); static inline void signal_wake_up(struct task_struct *t, bool resume) { signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0); } static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) { signal_wake_up_state(t, resume ? __TASK_TRACED : 0); } void task_join_group_stop(struct task_struct *task); #ifdef TIF_RESTORE_SIGMASK /* * Legacy restore_sigmask accessors. These are inefficient on * SMP architectures because they require atomic operations. */ /** * set_restore_sigmask() - make sure saved_sigmask processing gets done * * This sets TIF_RESTORE_SIGMASK and ensures that the arch signal code * will run before returning to user mode, to process the flag. For * all callers, TIF_SIGPENDING is already set or it's no harm to set * it. TIF_RESTORE_SIGMASK need not be in the set of bits that the * arch code will notice on return to user mode, in case those bits * are scarce. We set TIF_SIGPENDING here to ensure that the arch * signal code always gets run when TIF_RESTORE_SIGMASK is set. */ static inline void set_restore_sigmask(void) { set_thread_flag(TIF_RESTORE_SIGMASK); } static inline void clear_tsk_restore_sigmask(struct task_struct *task) { clear_tsk_thread_flag(task, TIF_RESTORE_SIGMASK); } static inline void clear_restore_sigmask(void) { clear_thread_flag(TIF_RESTORE_SIGMASK); } static inline bool test_tsk_restore_sigmask(struct task_struct *task) { return test_tsk_thread_flag(task, TIF_RESTORE_SIGMASK); } static inline bool test_restore_sigmask(void) { return test_thread_flag(TIF_RESTORE_SIGMASK); } static inline bool test_and_clear_restore_sigmask(void) { return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK); } #else /* TIF_RESTORE_SIGMASK */ /* Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist. */ static inline void set_restore_sigmask(void) { current->restore_sigmask = true; } static inline void clear_tsk_restore_sigmask(struct task_struct *task) { task->restore_sigmask = false; } static inline void clear_restore_sigmask(void) { current->restore_sigmask = false; } static inline bool test_restore_sigmask(void) { return current->restore_sigmask; } static inline bool test_tsk_restore_sigmask(struct task_struct *task) { return task->restore_sigmask; } static inline bool test_and_clear_restore_sigmask(void) { if (!current->restore_sigmask) return false; current->restore_sigmask = false; return true; } #endif static inline void restore_saved_sigmask(void) { if (test_and_clear_restore_sigmask()) __set_current_blocked(&current->saved_sigmask); } extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize); static inline void restore_saved_sigmask_unless(bool interrupted) { if (interrupted) WARN_ON(!signal_pending(current)); else restore_saved_sigmask(); } static inline sigset_t *sigmask_to_save(void) { sigset_t *res = &current->blocked; if (unlikely(test_restore_sigmask())) res = &current->saved_sigmask; return res; } static inline int kill_cad_pid(int sig, int priv) { return kill_pid(cad_pid, sig, priv); } /* These can be the second arg to send_sig_info/send_group_sig_info. */ #define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0) #define SEND_SIG_PRIV ((struct kernel_siginfo *) 1) static inline int __on_sig_stack(unsigned long sp) { #ifdef CONFIG_STACK_GROWSUP return sp >= current->sas_ss_sp && sp - current->sas_ss_sp < current->sas_ss_size; #else return sp > current->sas_ss_sp && sp - current->sas_ss_sp <= current->sas_ss_size; #endif } /* * True if we are on the alternate signal stack. */ static inline int on_sig_stack(unsigned long sp) { /* * If the signal stack is SS_AUTODISARM then, by construction, we * can't be on the signal stack unless user code deliberately set * SS_AUTODISARM when we were already on it. * * This improves reliability: if user state gets corrupted such that * the stack pointer points very close to the end of the signal stack, * then this check will enable the signal to be handled anyway. */ if (current->sas_ss_flags & SS_AUTODISARM) return 0; return __on_sig_stack(sp); } static inline int sas_ss_flags(unsigned long sp) { if (!current->sas_ss_size) return SS_DISABLE; return on_sig_stack(sp) ? SS_ONSTACK : 0; } static inline void sas_ss_reset(struct task_struct *p) { p->sas_ss_sp = 0; p->sas_ss_size = 0; p->sas_ss_flags = SS_DISABLE; } static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig) { if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp)) #ifdef CONFIG_STACK_GROWSUP return current->sas_ss_sp; #else return current->sas_ss_sp + current->sas_ss_size; #endif return sp; } extern void __cleanup_sighand(struct sighand_struct *); extern void flush_itimer_signals(void); #define tasklist_empty() \ list_empty(&init_task.tasks) #define next_task(p) \ list_entry_rcu((p)->tasks.next, struct task_struct, tasks) #define for_each_process(p) \ for (p = &init_task ; (p = next_task(p)) != &init_task ; ) extern bool current_is_single_threaded(void); /* * Careful: do_each_thread/while_each_thread is a double loop so * 'break' will not work as expected - use goto instead. */ #define do_each_thread(g, t) \ for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do #define while_each_thread(g, t) \ while ((t = next_thread(t)) != g) #define __for_each_thread(signal, t) \ list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node) #define for_each_thread(p, t) \ __for_each_thread((p)->signal, t) /* Careful: this is a double loop, 'break' won't work as expected. */ #define for_each_process_thread(p, t) \ for_each_process(p) for_each_thread(p, t) typedef int (*proc_visitor)(struct task_struct *p, void *data); void walk_process_tree(struct task_struct *top, proc_visitor, void *); static inline struct pid *task_pid_type(struct task_struct *task, enum pid_type type) { struct pid *pid; if (type == PIDTYPE_PID) pid = task_pid(task); else pid = task->signal->pids[type]; return pid; } static inline struct pid *task_tgid(struct task_struct *task) { return task->signal->pids[PIDTYPE_TGID]; } /* * Without tasklist or RCU lock it is not safe to dereference * the result of task_pgrp/task_session even if task == current, * we can race with another thread doing sys_setsid/sys_setpgid. */ static inline struct pid *task_pgrp(struct task_struct *task) { return task->signal->pids[PIDTYPE_PGID]; } static inline struct pid *task_session(struct task_struct *task) { return task->signal->pids[PIDTYPE_SID]; } static inline int get_nr_threads(struct task_struct *task) { return task->signal->nr_threads; } static inline bool thread_group_leader(struct task_struct *p) { return p->exit_signal >= 0; } static inline bool same_thread_group(struct task_struct *p1, struct task_struct *p2) { return p1->signal == p2->signal; } static inline struct task_struct *next_thread(const struct task_struct *p) { return list_entry_rcu(p->thread_group.next, struct task_struct, thread_group); } static inline int thread_group_empty(struct task_struct *p) { return list_empty(&p->thread_group); } #define delay_group_leader(p) \ (thread_group_leader(p) && !thread_group_empty(p)) extern bool thread_group_exited(struct pid *pid); extern struct sighand_struct *__lock_task_sighand(struct task_struct *task, unsigned long *flags); static inline struct sighand_struct *lock_task_sighand(struct task_struct *task, unsigned long *flags) { struct sighand_struct *ret; ret = __lock_task_sighand(task, flags); (void)__cond_lock(&task->sighand->siglock, ret); return ret; } static inline void unlock_task_sighand(struct task_struct *task, unsigned long *flags) { spin_unlock_irqrestore(&task->sighand->siglock, *flags); } static inline unsigned long task_rlimit(const struct task_struct *task, unsigned int limit) { return READ_ONCE(task->signal->rlim[limit].rlim_cur); } static inline unsigned long task_rlimit_max(const struct task_struct *task, unsigned int limit) { return READ_ONCE(task->signal->rlim[limit].rlim_max); } static inline unsigned long rlimit(unsigned int limit) { return task_rlimit(current, limit); } static inline unsigned long rlimit_max(unsigned int limit) { return task_rlimit_max(current, limit); } #endif /* _LINUX_SCHED_SIGNAL_H */
2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 11050 11051 11052 11053 11054 11055 11056 11057 11058 11059 11060 11061 11062 11063 11064 11065 11066 11067 11068 11069 11070 11071 11072 11073 11074 11075 11076 11077 11078 11079 11080 11081 11082 11083 11084 11085 11086 11087 11088 11089 11090 11091 11092 11093 11094 11095 11096 11097 11098 11099 11100 11101 11102 11103 11104 11105 11106 11107 11108 11109 11110 11111 11112 11113 11114 11115 11116 11117 11118 11119 11120 11121 11122 11123 11124 11125 11126 11127 11128 11129 11130 11131 11132 11133 11134 11135 11136 11137 11138 11139 11140 11141 11142 11143 11144 11145 11146 11147 11148 11149 11150 11151 11152 11153 11154 11155 11156 11157 11158 11159 11160 11161 11162 11163 11164 11165 11166 11167 11168 11169 11170 11171 11172 11173 11174 11175 11176 11177 11178 11179 11180 11181 11182 11183 11184 11185 11186 11187 11188 11189 11190 11191 11192 11193 11194 11195 11196 11197 11198 11199 11200 11201 11202 11203 11204 11205 11206 11207 11208 11209 11210 11211 11212 11213 11214 11215 11216 11217 11218 11219 11220 11221 11222 11223 11224 11225 11226 11227 11228 11229 11230 11231 11232 11233 11234 11235 11236 11237 11238 11239 11240 11241 11242 11243 11244 11245 11246 11247 11248 11249 11250 11251 11252 11253 11254 11255 11256 11257 11258 11259 11260 11261 11262 11263 11264 11265 11266 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277 11278 11279 11280 11281 11282 11283 11284 11285 11286 11287 11288 11289 11290 11291 11292 11293 11294 11295 11296 11297 11298 11299 11300 11301 11302 11303 11304 11305 11306 11307 11308 11309 11310 11311 11312 11313 11314 11315 11316 11317 11318 11319 11320 11321 11322 11323 11324 11325 11326 11327 11328 11329 11330 11331 11332 11333 11334 11335 11336 11337 11338 11339 11340 11341 11342 11343 11344 11345 11346 11347 11348 11349 11350 11351 11352 11353 11354 11355 11356 11357 11358 11359 11360 11361 11362 11363 11364 11365 11366 11367 11368 11369 11370 11371 11372 11373 11374 11375 11376 11377 11378 11379 11380 11381 11382 11383 11384 11385 11386 11387 11388 11389 11390 11391 11392 11393 11394 11395 11396 11397 11398 11399 11400 11401 11402 11403 11404 11405 11406 11407 11408 11409 11410 11411 11412 11413 11414 11415 11416 11417 11418 11419 11420 11421 11422 11423 11424 11425 11426 11427 11428 11429 11430 11431 11432 11433 11434 11435 11436 11437 11438 11439 11440 11441 11442 11443 11444 11445 11446 11447 11448 11449 11450 11451 11452 11453 11454 11455 11456 11457 11458 11459 11460 11461 11462 11463 11464 11465 11466 11467 11468 11469 11470 11471 11472 11473 11474 11475 11476 11477 11478 11479 11480 11481 11482 11483 11484 11485 11486 11487 11488 11489 11490 11491 11492 11493 11494 11495 11496 11497 11498 11499 11500 11501 11502 11503 11504 11505 11506 11507 11508 11509 11510 11511 11512 11513 11514 11515 11516 11517 11518 11519 11520 11521 11522 11523 11524 11525 11526 11527 11528 11529 11530 11531 11532 11533 11534 11535 11536 11537 11538 11539 11540 11541 11542 11543 11544 11545 11546 11547 11548 11549 11550 11551 11552 11553 11554 11555 11556 11557 11558 11559 11560 11561 11562 11563 11564 11565 11566 11567 11568 11569 11570 11571 11572 11573 11574 11575 11576 11577 11578 11579 11580 11581 11582 11583 11584 11585 11586 11587 11588 11589 11590 11591 11592 11593 11594 11595 11596 11597 11598 11599 11600 11601 11602 11603 11604 11605 11606 11607 11608 11609 11610 11611 11612 11613 11614 11615 11616 11617 11618 11619 11620 11621 11622 11623 11624 11625 11626 11627 11628 11629 11630 11631 11632 11633 11634 11635 11636 11637 11638 11639 11640 11641 11642 11643 11644 11645 11646 11647 11648 11649 11650 11651 11652 11653 11654 11655 11656 11657 11658 11659 11660 11661 11662 11663 11664 11665 11666 11667 11668 11669 11670 11671 11672 11673 11674 11675 11676 11677 11678 11679 11680 11681 11682 11683 11684 11685 11686 11687 11688 11689 11690 11691 11692 11693 11694 11695 11696 11697 11698 11699 11700 11701 11702 11703 11704 11705 11706 11707 11708 11709 11710 11711 11712 11713 11714 11715 11716 11717 11718 11719 11720 11721 11722 11723 11724 11725 11726 11727 11728 11729 11730 11731 11732 11733 11734 11735 11736 11737 11738 11739 11740 11741 11742 11743 11744 11745 11746 11747 11748 11749 11750 11751 11752 11753 11754 11755 11756 11757 11758 11759 11760 11761 11762 11763 11764 11765 11766 11767 11768 11769 11770 11771 11772 11773 11774 11775 11776 11777 11778 11779 11780 11781 11782 11783 11784 11785 11786 11787 11788 11789 11790 11791 11792 11793 11794 11795 11796 11797 11798 11799 11800 11801 11802 11803 11804 11805 11806 11807 11808 11809 11810 11811 11812 11813 11814 11815 11816 11817 11818 11819 11820 11821 11822 11823 11824 11825 11826 11827 11828 11829 11830 11831 11832 11833 11834 11835 11836 11837 11838 11839 11840 11841 11842 11843 11844 11845 11846 11847 11848 11849 11850 11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 11872 11873 11874 11875 11876 11877 11878 11879 11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 11916 11917 11918 11919 11920 11921 11922 11923 11924 11925 11926 11927 11928 11929 11930 11931 11932 11933 11934 11935 11936 11937 11938 11939 11940 11941 11942 11943 11944 11945 11946 11947 11948 11949 11950 11951 11952 11953 11954 11955 11956 11957 11958 11959 11960 11961 11962 11963 11964 11965 11966 11967 11968 11969 11970 11971 11972 11973 11974 11975 11976 11977 11978 11979 11980 11981 11982 11983 11984 11985 11986 11987 11988 11989 11990 11991 11992 11993 11994 11995 11996 11997 11998 11999 12000 12001 12002 12003 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015 12016 12017 12018 12019 12020 12021 12022 12023 12024 12025 12026 12027 12028 12029 12030 12031 12032 12033 12034 12035 12036 12037 12038 12039 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050 12051 12052 12053 12054 12055 12056 12057 12058 12059 12060 12061 12062 12063 12064 12065 12066 12067 12068 12069 12070 12071 12072 12073 12074 12075 12076 12077 12078 12079 12080 12081 12082 12083 12084 12085 12086 12087 12088 12089 12090 12091 12092 12093 12094 12095 12096 12097 12098 12099 12100 12101 12102 12103 12104 12105 12106 12107 12108 12109 12110 12111 12112 12113 12114 12115 12116 12117 12118 12119 12120 12121 12122 12123 12124 12125 12126 12127 12128 12129 12130 12131 12132 12133 12134 12135 12136 12137 12138 12139 12140 12141 12142 12143 12144 12145 12146 12147 12148 12149 12150 12151 12152 12153 12154 12155 12156 12157 12158 12159 12160 12161 12162 12163 12164 12165 12166 12167 12168 12169 12170 12171 12172 12173 12174 12175 12176 12177 12178 12179 12180 12181 12182 12183 12184 12185 12186 12187 12188 12189 12190 12191 12192 12193 12194 12195 12196 12197 12198 12199 12200 12201 12202 12203 12204 12205 12206 12207 12208 12209 12210 12211 12212 12213 12214 12215 12216 12217 12218 12219 12220 12221 12222 12223 12224 12225 12226 12227 12228 12229 12230 12231 12232 12233 12234 12235 12236 12237 12238 12239 12240 12241 12242 12243 12244 12245 12246 12247 12248 12249 12250 12251 12252 12253 12254 12255 12256 12257 12258 12259 12260 12261 12262 12263 12264 12265 12266 12267 12268 12269 12270 12271 12272 12273 12274 12275 12276 12277 12278 12279 12280 12281 12282 12283 12284 12285 12286 12287 12288 12289 12290 12291 12292 12293 12294 12295 12296 12297 12298 12299 12300 12301 12302 12303 12304 12305 12306 12307 12308 12309 12310 12311 12312 12313 12314 12315 12316 12317 12318 12319 12320 12321 12322 12323 12324 12325 12326 12327 12328 12329 12330 12331 12332 12333 12334 12335 12336 12337 12338 12339 12340 12341 12342 12343 12344 12345 12346 12347 12348 12349 12350 12351 12352 12353 12354 12355 12356 12357 12358 12359 12360 12361 12362 12363 12364 12365 12366 12367 12368 12369 12370 12371 12372 12373 12374 12375 12376 12377 12378 12379 12380 12381 12382 12383 12384 12385 12386 12387 12388 12389 12390 12391 12392 12393 12394 12395 12396 12397 12398 12399 12400 12401 12402 12403 12404 12405 12406 12407 12408 12409 12410 12411 12412 12413 12414 12415 12416 12417 12418 12419 12420 12421 12422 12423 12424 12425 12426 12427 12428 12429 12430 12431 12432 12433 12434 12435 12436 12437 12438 12439 12440 12441 12442 12443 12444 12445 12446 12447 12448 12449 12450 12451 12452 12453 12454 12455 12456 12457 12458 12459 12460 12461 12462 12463 12464 12465 12466 12467 12468 12469 12470 12471 12472 12473 12474 12475 12476 12477 12478 12479 12480 12481 12482 12483 12484 12485 12486 12487 12488 12489 12490 12491 12492 12493 12494 12495 12496 12497 12498 12499 12500 12501 12502 12503 12504 12505 12506 12507 12508 12509 12510 12511 12512 12513 12514 12515 12516 12517 12518 12519 12520 12521 12522 12523 12524 12525 12526 12527 12528 12529 12530 12531 12532 12533 12534 12535 12536 12537 12538 12539 12540 12541 12542 12543 12544 12545 12546 12547 12548 12549 12550 12551 12552 12553 12554 12555 12556 12557 12558 12559 12560 12561 12562 12563 12564 12565 12566 12567 12568 12569 12570 12571 12572 12573 12574 12575 12576 12577 12578 12579 12580 12581 12582 12583 12584 12585 12586 12587 12588 12589 12590 12591 12592 12593 12594 12595 12596 12597 12598 12599 12600 12601 12602 12603 12604 12605 12606 12607 12608 12609 12610 12611 12612 12613 12614 12615 12616 12617 12618 12619 12620 12621 12622 12623 12624 12625 12626 12627 12628 12629 12630 12631 12632 12633 12634 12635 12636 12637 12638 12639 12640 12641 12642 12643 12644 12645 12646 12647 12648 12649 12650 12651 12652 12653 12654 12655 12656 12657 12658 12659 12660 12661 12662 12663 12664 12665 12666 12667 12668 12669 12670 12671 12672 12673 12674 12675 12676 12677 12678 12679 12680 12681 12682 12683 12684 12685 12686 12687 12688 12689 12690 12691 12692 12693 12694 12695 12696 12697 12698 12699 12700 12701 12702 12703 12704 12705 12706 12707 12708 12709 12710 12711 12712 12713 12714 12715 12716 12717 12718 12719 12720 12721 12722 12723 12724 12725 12726 12727 12728 12729 12730 12731 12732 12733 12734 12735 12736 12737 12738 12739 12740 12741 12742 12743 12744 12745 12746 12747 12748 12749 12750 12751 12752 12753 12754 12755 12756 12757 12758 12759 12760 12761 12762 12763 12764 12765 12766 12767 12768 12769 12770 12771 12772 12773 12774 12775 12776 12777 12778 12779 12780 12781 12782 12783 12784 12785 12786 12787 12788 12789 12790 12791 12792 12793 12794 12795 12796 12797 12798 12799 12800 12801 12802 12803 12804 12805 12806 12807 12808 12809 12810 12811 12812 12813 12814 12815 12816 12817 12818 12819 12820 12821 12822 12823 12824 12825 12826 12827 12828 12829 12830 12831 12832 12833 12834 12835 12836 12837 12838 12839 12840 12841 12842 12843 12844 12845 12846 12847 12848 12849 12850 12851 12852 12853 12854 12855 12856 12857 12858 12859 12860 12861 12862 12863 12864 12865 12866 12867 12868 12869 12870 12871 12872 12873 12874 12875 12876 12877 12878 12879 12880 12881 12882 12883 12884 12885 12886 12887 12888 12889 12890 12891 12892 12893 12894 12895 12896 12897 12898 12899 12900 12901 12902 12903 12904 12905 12906 12907 12908 12909 12910 12911 12912 12913 12914 12915 12916 12917 12918 12919 12920 12921 12922 12923 12924 12925 12926 12927 12928 12929 12930 12931 12932 12933 12934 12935 12936 12937 12938 12939 12940 12941 12942 12943 12944 12945 12946 12947 12948 12949 12950 12951 12952 12953 12954 12955 12956 12957 12958 12959 12960 12961 12962 12963 12964 12965 12966 12967 12968 12969 12970 12971 12972 12973 12974 12975 12976 12977 12978 12979 12980 12981 12982 12983 12984 12985 12986 12987 12988 12989 12990 12991 12992 12993 12994 12995 12996 12997 12998 12999 13000 13001 13002 13003 13004 13005 13006 13007 13008 13009 13010 13011 13012 13013 13014 13015 13016 13017 13018 13019 13020 13021 13022 13023 13024 13025 13026 13027 13028 13029 13030 13031 13032 13033 13034 13035 13036 13037 13038 13039 13040 13041 13042 13043 13044 13045 13046 13047 13048 13049 13050 13051 13052 13053 13054 13055 13056 13057 13058 13059 13060 13061 13062 13063 13064 13065 13066 13067 13068 13069 13070 13071 13072 13073 13074 13075 13076 13077 13078 13079 13080 13081 13082 13083 13084 13085 13086 13087 13088 13089 13090 13091 13092 13093 13094 13095 13096 13097 13098 13099 13100 13101 13102 13103 13104 13105 13106 13107 13108 13109 13110 13111 13112 13113 13114 13115 13116 13117 13118 13119 13120 13121 13122 13123 13124 13125 13126 13127 13128 13129 13130 13131 13132 13133 13134 13135 13136 13137 13138 13139 13140 13141 13142 13143 13144 13145 13146 13147 13148 13149 13150 13151 13152 13153 13154 13155 13156 13157 13158 13159 13160 13161 13162 13163 13164 13165 13166 13167 13168 13169 13170 13171 13172 13173 13174 13175 13176 13177 13178 13179 13180 13181 13182 13183 13184 13185 13186 13187 13188 13189 13190 13191 13192 13193 13194 13195 13196 13197 13198 13199 13200 13201 13202 13203 13204 13205 13206 13207 13208 13209 13210 13211 13212 13213 13214 13215 13216 13217 13218 13219 13220 13221 13222 13223 13224 13225 13226 13227 13228 13229 13230 13231 13232 13233 13234 13235 13236 13237 13238 13239 13240 13241 13242 13243 13244 13245 13246 13247 13248 13249 13250 13251 13252 13253 13254 13255 13256 13257 13258 13259 13260 13261 13262 13263 13264 13265 13266 13267 13268 13269 13270 13271 13272 13273 13274 13275 13276 13277 13278 13279 13280 13281 13282 13283 13284 13285 13286 13287 13288 13289 13290 13291 13292 13293 13294 13295 13296 13297 13298 13299 13300 13301 13302 13303 13304 13305 13306 13307 13308 13309 13310 13311 13312 13313 13314 13315 13316 13317 13318 13319 13320 13321 13322 13323 13324 13325 13326 13327 13328 13329 13330 13331 13332 13333 13334 13335 13336 13337 13338 13339 13340 13341 13342 // SPDX-License-Identifier: GPL-2.0 /* * Performance events core code: * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> */ #include <linux/fs.h> #include <linux/mm.h> #include <linux/cpu.h> #include <linux/smp.h> #include <linux/idr.h> #include <linux/file.h> #include <linux/poll.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/tick.h> #include <linux/sysfs.h> #include <linux/dcache.h> #include <linux/percpu.h> #include <linux/ptrace.h> #include <linux/reboot.h> #include <linux/vmstat.h> #include <linux/device.h> #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/hardirq.h> #include <linux/hugetlb.h> #include <linux/rculist.h> #include <linux/uaccess.h> #include <linux/syscalls.h> #include <linux/anon_inodes.h> #include <linux/kernel_stat.h> #include <linux/cgroup.h> #include <linux/perf_event.h> #include <linux/trace_events.h> #include <linux/hw_breakpoint.h> #include <linux/mm_types.h> #include <linux/module.h> #include <linux/mman.h> #include <linux/compat.h> #include <linux/bpf.h> #include <linux/filter.h> #include <linux/namei.h> #include <linux/parser.h> #include <linux/sched/clock.h> #include <linux/sched/mm.h> #include <linux/proc_ns.h> #include <linux/mount.h> #include <linux/min_heap.h> #include "internal.h" #include <asm/irq_regs.h> typedef int (*remote_function_f)(void *); struct remote_function_call { struct task_struct *p; remote_function_f func; void *info; int ret; }; static void remote_function(void *data) { struct remote_function_call *tfc = data; struct task_struct *p = tfc->p; if (p) { /* -EAGAIN */ if (task_cpu(p) != smp_processor_id()) return; /* * Now that we're on right CPU with IRQs disabled, we can test * if we hit the right task without races. */ tfc->ret = -ESRCH; /* No such (running) process */ if (p != current) return; } tfc->ret = tfc->func(tfc->info); } /** * task_function_call - call a function on the cpu on which a task runs * @p: the task to evaluate * @func: the function to be called * @info: the function call argument * * Calls the function @func when the task is currently running. This might * be on the current CPU, which just calls the function directly. This will * retry due to any failures in smp_call_function_single(), such as if the * task_cpu() goes offline concurrently. * * returns @func return value or -ESRCH or -ENXIO when the process isn't running */ static int task_function_call(struct task_struct *p, remote_function_f func, void *info) { struct remote_function_call data = { .p = p, .func = func, .info = info, .ret = -EAGAIN, }; int ret; for (;;) { ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); if (!ret) ret = data.ret; if (ret != -EAGAIN) break; cond_resched(); } return ret; } /** * cpu_function_call - call a function on the cpu * @func: the function to be called * @info: the function call argument * * Calls the function @func on the remote cpu. * * returns: @func return value or -ENXIO when the cpu is offline */ static int cpu_function_call(int cpu, remote_function_f func, void *info) { struct remote_function_call data = { .p = NULL, .func = func, .info = info, .ret = -ENXIO, /* No such CPU */ }; smp_call_function_single(cpu, remote_function, &data, 1); return data.ret; } static inline struct perf_cpu_context * __get_cpu_context(struct perf_event_context *ctx) { return this_cpu_ptr(ctx->pmu->pmu_cpu_context); } static void perf_ctx_lock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { raw_spin_lock(&cpuctx->ctx.lock); if (ctx) raw_spin_lock(&ctx->lock); } static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { if (ctx) raw_spin_unlock(&ctx->lock); raw_spin_unlock(&cpuctx->ctx.lock); } #define TASK_TOMBSTONE ((void *)-1L) static bool is_kernel_event(struct perf_event *event) { return READ_ONCE(event->owner) == TASK_TOMBSTONE; } /* * On task ctx scheduling... * * When !ctx->nr_events a task context will not be scheduled. This means * we can disable the scheduler hooks (for performance) without leaving * pending task ctx state. * * This however results in two special cases: * * - removing the last event from a task ctx; this is relatively straight * forward and is done in __perf_remove_from_context. * * - adding the first event to a task ctx; this is tricky because we cannot * rely on ctx->is_active and therefore cannot use event_function_call(). * See perf_install_in_context(). * * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. */ typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *, struct perf_event_context *, void *); struct event_function_struct { struct perf_event *event; event_f func; void *data; }; static int event_function(void *info) { struct event_function_struct *efs = info; struct perf_event *event = efs->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event_context *task_ctx = cpuctx->task_ctx; int ret = 0; lockdep_assert_irqs_disabled(); perf_ctx_lock(cpuctx, task_ctx); /* * Since we do the IPI call without holding ctx->lock things can have * changed, double check we hit the task we set out to hit. */ if (ctx->task) { if (ctx->task != current) { ret = -ESRCH; goto unlock; } /* * We only use event_function_call() on established contexts, * and event_function() is only ever called when active (or * rather, we'll have bailed in task_function_call() or the * above ctx->task != current test), therefore we must have * ctx->is_active here. */ WARN_ON_ONCE(!ctx->is_active); /* * And since we have ctx->is_active, cpuctx->task_ctx must * match. */ WARN_ON_ONCE(task_ctx != ctx); } else { WARN_ON_ONCE(&cpuctx->ctx != ctx); } efs->func(event, cpuctx, ctx, efs->data); unlock: perf_ctx_unlock(cpuctx, task_ctx); return ret; } static void event_function_call(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ struct event_function_struct efs = { .event = event, .func = func, .data = data, }; if (!event->parent) { /* * If this is a !child event, we must hold ctx::mutex to * stabilize the event->ctx relation. See * perf_event_ctx_lock(). */ lockdep_assert_held(&ctx->mutex); } if (!task) { cpu_function_call(event->cpu, event_function, &efs); return; } if (task == TASK_TOMBSTONE) return; again: if (!task_function_call(task, event_function, &efs)) return; raw_spin_lock_irq(&ctx->lock); /* * Reload the task pointer, it might have been changed by * a concurrent perf_event_context_sched_out(). */ task = ctx->task; if (task == TASK_TOMBSTONE) { raw_spin_unlock_irq(&ctx->lock); return; } if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); goto again; } func(event, NULL, ctx, data); raw_spin_unlock_irq(&ctx->lock); } /* * Similar to event_function_call() + event_function(), but hard assumes IRQs * are already disabled and we're on the right CPU. */ static void event_function_local(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct task_struct *task = READ_ONCE(ctx->task); struct perf_event_context *task_ctx = NULL; lockdep_assert_irqs_disabled(); if (task) { if (task == TASK_TOMBSTONE) return; task_ctx = ctx; } perf_ctx_lock(cpuctx, task_ctx); task = ctx->task; if (task == TASK_TOMBSTONE) goto unlock; if (task) { /* * We must be either inactive or active and the right task, * otherwise we're screwed, since we cannot IPI to somewhere * else. */ if (ctx->is_active) { if (WARN_ON_ONCE(task != current)) goto unlock; if (WARN_ON_ONCE(cpuctx->task_ctx != ctx)) goto unlock; } } else { WARN_ON_ONCE(&cpuctx->ctx != ctx); } func(event, cpuctx, ctx, data); unlock: perf_ctx_unlock(cpuctx, task_ctx); } #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP |\ PERF_FLAG_FD_CLOEXEC) /* * branch priv levels that need permission checks */ #define PERF_SAMPLE_BRANCH_PERM_PLM \ (PERF_SAMPLE_BRANCH_KERNEL |\ PERF_SAMPLE_BRANCH_HV) enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, EVENT_TIME = 0x4, /* see ctx_resched() for details */ EVENT_CPU = 0x8, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; /* * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ static void perf_sched_delayed(struct work_struct *work); DEFINE_STATIC_KEY_FALSE(perf_sched_events); static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(int, perf_sched_cb_usages); static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_namespaces_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; static atomic_t nr_switch_events __read_mostly; static atomic_t nr_ksymbol_events __read_mostly; static atomic_t nr_bpf_events __read_mostly; static atomic_t nr_cgroup_events __read_mostly; static atomic_t nr_text_poke_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); static struct srcu_struct pmus_srcu; static cpumask_var_t perf_online_mask; /* * perf event paranoia level: * -1 - not paranoid at all * 0 - disallow raw tracepoint access for unpriv * 1 - disallow cpu events for unpriv * 2 - disallow kernel profiling for unpriv */ int sysctl_perf_event_paranoid __read_mostly = 2; /* Minimum for 512 kiB + 1 user control page */ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ /* * max perf event sample rate */ #define DEFAULT_MAX_SAMPLE_RATE 100000 #define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE) #define DEFAULT_CPU_TIME_MAX_PERCENT 25 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; static int perf_sample_allowed_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; static void update_perf_cpu_limits(void) { u64 tmp = perf_sample_period_ns; tmp *= sysctl_perf_cpu_time_max_percent; tmp = div_u64(tmp, 100); if (!tmp) tmp = 1; WRITE_ONCE(perf_sample_allowed_ns, tmp); } static bool perf_rotate_context(struct perf_cpu_context *cpuctx); int perf_proc_update_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; int perf_cpu = sysctl_perf_cpu_time_max_percent; /* * If throttling is disabled don't allow the write: */ if (write && (perf_cpu == 100 || perf_cpu == 0)) return -EINVAL; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; update_perf_cpu_limits(); return 0; } int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; if (sysctl_perf_cpu_time_max_percent == 100 || sysctl_perf_cpu_time_max_percent == 0) { printk(KERN_WARNING "perf: Dynamic interrupt throttling disabled, can hang your system!\n"); WRITE_ONCE(perf_sample_allowed_ns, 0); } else { update_perf_cpu_limits(); } return 0; } /* * perf samples are done in some very critical code paths (NMIs). * If they take too much CPU time, the system can lock up and not * get any real work done. This will drop the sample rate when * we detect that events are taking too long. */ #define NR_ACCUMULATED_SAMPLES 128 static DEFINE_PER_CPU(u64, running_sample_length); static u64 __report_avg; static u64 __report_allowed; static void perf_duration_warn(struct irq_work *w) { printk_ratelimited(KERN_INFO "perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", __report_avg, __report_allowed, sysctl_perf_event_sample_rate); } static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn); void perf_sample_event_took(u64 sample_len_ns) { u64 max_len = READ_ONCE(perf_sample_allowed_ns); u64 running_len; u64 avg_len; u32 max; if (max_len == 0) return; /* Decay the counter by 1 average sample. */ running_len = __this_cpu_read(running_sample_length); running_len -= running_len/NR_ACCUMULATED_SAMPLES; running_len += sample_len_ns; __this_cpu_write(running_sample_length, running_len); /* * Note: this will be biased artifically low until we have * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us * from having to maintain a count. */ avg_len = running_len/NR_ACCUMULATED_SAMPLES; if (avg_len <= max_len) return; __report_avg = avg_len; __report_allowed = max_len; /* * Compute a throttle threshold 25% below the current duration. */ avg_len += avg_len / 4; max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent; if (avg_len < max) max /= (u32)avg_len; else max = 1; WRITE_ONCE(perf_sample_allowed_ns, avg_len); WRITE_ONCE(max_samples_per_tick, max); sysctl_perf_event_sample_rate = max * HZ; perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; if (!irq_work_queue(&perf_duration_work)) { early_printk("perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", __report_avg, __report_allowed, sysctl_perf_event_sample_rate); } } static atomic64_t perf_event_id; static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, enum event_type_t event_type); static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, enum event_type_t event_type, struct task_struct *task); static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); void __weak perf_event_print_debug(void) { } extern __weak const char *perf_pmu_name(void) { return "pmu"; } static inline u64 perf_clock(void) { return local_clock(); } static inline u64 perf_event_clock(struct perf_event *event) { return event->clock(); } /* * State based event timekeeping... * * The basic idea is to use event->state to determine which (if any) time * fields to increment with the current delta. This means we only need to * update timestamps when we change state or when they are explicitly requested * (read). * * Event groups make things a little more complicated, but not terribly so. The * rules for a group are that if the group leader is OFF the entire group is * OFF, irrespecive of what the group member states are. This results in * __perf_effective_state(). * * A futher ramification is that when a group leader flips between OFF and * !OFF, we need to update all group member times. * * * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we * need to make sure the relevant context time is updated before we try and * update our timestamps. */ static __always_inline enum perf_event_state __perf_effective_state(struct perf_event *event) { struct perf_event *leader = event->group_leader; if (leader->state <= PERF_EVENT_STATE_OFF) return leader->state; return event->state; } static __always_inline void __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running) { enum perf_event_state state = __perf_effective_state(event); u64 delta = now - event->tstamp; *enabled = event->total_time_enabled; if (state >= PERF_EVENT_STATE_INACTIVE) *enabled += delta; *running = event->total_time_running; if (state >= PERF_EVENT_STATE_ACTIVE) *running += delta; } static void perf_event_update_time(struct perf_event *event) { u64 now = perf_event_time(event); __perf_update_times(event, now, &event->total_time_enabled, &event->total_time_running); event->tstamp = now; } static void perf_event_update_sibling_time(struct perf_event *leader) { struct perf_event *sibling; for_each_sibling_event(sibling, leader) perf_event_update_time(sibling); } static void perf_event_set_state(struct perf_event *event, enum perf_event_state state) { if (event->state == state) return; perf_event_update_time(event); /* * If a group leader gets enabled/disabled all its siblings * are affected too. */ if ((event->state < 0) ^ (state < 0)) perf_event_update_sibling_time(event); WRITE_ONCE(event->state, state); } /* * UP store-release, load-acquire */ #define __store_release(ptr, val) \ do { \ barrier(); \ WRITE_ONCE(*(ptr), (val)); \ } while (0) #define __load_acquire(ptr) \ ({ \ __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \ barrier(); \ ___p; \ }) #ifdef CONFIG_CGROUP_PERF static inline bool perf_cgroup_match(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); /* @event doesn't care about cgroup */ if (!event->cgrp) return true; /* wants specific cgroup scope but @cpuctx isn't associated with any */ if (!cpuctx->cgrp) return false; /* * Cgroup scoping is recursive. An event enabled for a cgroup is * also enabled for all its descendant cgroups. If @cpuctx's * cgroup is a descendant of @event's (the test covers identity * case), it's a match. */ return cgroup_is_descendant(cpuctx->cgrp->css.cgroup, event->cgrp->css.cgroup); } static inline void perf_detach_cgroup(struct perf_event *event) { css_put(&event->cgrp->css); event->cgrp = NULL; } static inline int is_cgroup_event(struct perf_event *event) { return event->cgrp != NULL; } static inline u64 perf_cgroup_event_time(struct perf_event *event) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); return t->time; } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); if (!__load_acquire(&t->active)) return t->time; now += READ_ONCE(t->timeoffset); return now; } static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) { if (adv) info->time += now - info->timestamp; info->timestamp = now; /* * see update_context_time() */ WRITE_ONCE(info->timeoffset, info->time - info->timestamp); } static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { struct perf_cgroup *cgrp = cpuctx->cgrp; struct cgroup_subsys_state *css; struct perf_cgroup_info *info; if (cgrp) { u64 now = perf_clock(); for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); __update_cgrp_time(info, now, true); if (final) __store_release(&info->active, 0); } } } static inline void update_cgrp_time_from_event(struct perf_event *event) { struct perf_cgroup_info *info; struct perf_cgroup *cgrp; /* * ensure we access cgroup data only when needed and * when we know the cgroup is pinned (css_get) */ if (!is_cgroup_event(event)) return; cgrp = perf_cgroup_from_task(current, event->ctx); /* * Do not update time when cgroup is not active */ if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) { info = this_cpu_ptr(event->cgrp->info); __update_cgrp_time(info, perf_clock(), true); } } static inline void perf_cgroup_set_timestamp(struct task_struct *task, struct perf_event_context *ctx) { struct perf_cgroup *cgrp; struct perf_cgroup_info *info; struct cgroup_subsys_state *css; /* * ctx->lock held by caller * ensure we do not access cgroup data * unless we have the cgroup pinned (css_get) */ if (!task || !ctx->nr_cgroups) return; cgrp = perf_cgroup_from_task(task, ctx); for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); __update_cgrp_time(info, ctx->timestamp, false); __store_release(&info->active, 1); } } static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); #define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ #define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ /* * reschedule events based on the cgroup constraint of task. * * mode SWOUT : schedule out everything * mode SWIN : schedule in based on cgroup for next */ static void perf_cgroup_switch(struct task_struct *task, int mode) { struct perf_cpu_context *cpuctx, *tmp; struct list_head *list; unsigned long flags; /* * Disable interrupts and preemption to avoid this CPU's * cgrp_cpuctx_entry to change under us. */ local_irq_save(flags); list = this_cpu_ptr(&cgrp_cpuctx_list); list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) { WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); if (mode & PERF_CGROUP_SWOUT) { cpu_ctx_sched_out(cpuctx, EVENT_ALL); /* * must not be done before ctxswout due * to event_filter_match() in event_sched_out() */ cpuctx->cgrp = NULL; } if (mode & PERF_CGROUP_SWIN) { WARN_ON_ONCE(cpuctx->cgrp); /* * set cgrp before ctxsw in to allow * event_filter_match() to not have to pass * task around * we pass the cpuctx->ctx to perf_cgroup_from_task() * because cgorup events are only per-cpu */ cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx); cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); } perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } local_irq_restore(flags); } static inline void perf_cgroup_sched_out(struct task_struct *task, struct task_struct *next) { struct perf_cgroup *cgrp1; struct perf_cgroup *cgrp2 = NULL; rcu_read_lock(); /* * we come here when we know perf_cgroup_events > 0 * we do not need to pass the ctx here because we know * we are holding the rcu lock */ cgrp1 = perf_cgroup_from_task(task, NULL); cgrp2 = perf_cgroup_from_task(next, NULL); /* * only schedule out current cgroup events if we know * that we are switching to a different cgroup. Otherwise, * do no touch the cgroup events. */ if (cgrp1 != cgrp2) perf_cgroup_switch(task, PERF_CGROUP_SWOUT); rcu_read_unlock(); } static inline void perf_cgroup_sched_in(struct task_struct *prev, struct task_struct *task) { struct perf_cgroup *cgrp1; struct perf_cgroup *cgrp2 = NULL; rcu_read_lock(); /* * we come here when we know perf_cgroup_events > 0 * we do not need to pass the ctx here because we know * we are holding the rcu lock */ cgrp1 = perf_cgroup_from_task(task, NULL); cgrp2 = perf_cgroup_from_task(prev, NULL); /* * only need to schedule in cgroup events if we are changing * cgroup during ctxsw. Cgroup events were not scheduled * out of ctxsw out if that was not the case. */ if (cgrp1 != cgrp2) perf_cgroup_switch(task, PERF_CGROUP_SWIN); rcu_read_unlock(); } static int perf_cgroup_ensure_storage(struct perf_event *event, struct cgroup_subsys_state *css) { struct perf_cpu_context *cpuctx; struct perf_event **storage; int cpu, heap_size, ret = 0; /* * Allow storage to have sufficent space for an iterator for each * possibly nested cgroup plus an iterator for events with no cgroup. */ for (heap_size = 1; css; css = css->parent) heap_size++; for_each_possible_cpu(cpu) { cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu); if (heap_size <= cpuctx->heap_size) continue; storage = kmalloc_node(heap_size * sizeof(struct perf_event *), GFP_KERNEL, cpu_to_node(cpu)); if (!storage) { ret = -ENOMEM; break; } raw_spin_lock_irq(&cpuctx->ctx.lock); if (cpuctx->heap_size < heap_size) { swap(cpuctx->heap, storage); if (storage == cpuctx->heap_default) storage = NULL; cpuctx->heap_size = heap_size; } raw_spin_unlock_irq(&cpuctx->ctx.lock); kfree(storage); } return ret; } static inline int perf_cgroup_connect(int fd, struct perf_event *event, struct perf_event_attr *attr, struct perf_event *group_leader) { struct perf_cgroup *cgrp; struct cgroup_subsys_state *css; struct fd f = fdget(fd); int ret = 0; if (!f.file) return -EBADF; css = css_tryget_online_from_dir(f.file->f_path.dentry, &perf_event_cgrp_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; } ret = perf_cgroup_ensure_storage(event, css); if (ret) goto out; cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; /* * all events in a group must monitor * the same cgroup because a task belongs * to only one perf cgroup at a time */ if (group_leader && group_leader->cgrp != cgrp) { perf_detach_cgroup(event); ret = -EINVAL; } out: fdput(f); return ret; } static inline void perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; if (!is_cgroup_event(event)) return; /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. */ cpuctx = container_of(ctx, struct perf_cpu_context, ctx); /* * Since setting cpuctx->cgrp is conditional on the current @cgrp * matching the event's cgroup, we must do this for every new event, * because if the first would mismatch, the second would not try again * and we would leave cpuctx->cgrp unset. */ if (ctx->is_active && !cpuctx->cgrp) { struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) cpuctx->cgrp = cgrp; } if (ctx->nr_cgroups++) return; list_add(&cpuctx->cgrp_cpuctx_entry, per_cpu_ptr(&cgrp_cpuctx_list, event->cpu)); } static inline void perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; if (!is_cgroup_event(event)) return; /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. */ cpuctx = container_of(ctx, struct perf_cpu_context, ctx); if (--ctx->nr_cgroups) return; if (ctx->is_active && cpuctx->cgrp) cpuctx->cgrp = NULL; list_del(&cpuctx->cgrp_cpuctx_entry); } #else /* !CONFIG_CGROUP_PERF */ static inline bool perf_cgroup_match(struct perf_event *event) { return true; } static inline void perf_detach_cgroup(struct perf_event *event) {} static inline int is_cgroup_event(struct perf_event *event) { return 0; } static inline void update_cgrp_time_from_event(struct perf_event *event) { } static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { } static inline void perf_cgroup_sched_out(struct task_struct *task, struct task_struct *next) { } static inline void perf_cgroup_sched_in(struct task_struct *prev, struct task_struct *task) { } static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, struct perf_event_attr *attr, struct perf_event *group_leader) { return -EINVAL; } static inline void perf_cgroup_set_timestamp(struct task_struct *task, struct perf_event_context *ctx) { } static inline void perf_cgroup_switch(struct task_struct *task, struct task_struct *next) { } static inline u64 perf_cgroup_event_time(struct perf_event *event) { return 0; } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { return 0; } static inline void perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { } static inline void perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) { } #endif /* * set default to be dependent on timer tick just * like original code */ #define PERF_CPU_HRTIMER (1000 / HZ) /* * function must be called with interrupts disabled */ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { struct perf_cpu_context *cpuctx; bool rotations; lockdep_assert_irqs_disabled(); cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); rotations = perf_rotate_context(cpuctx); raw_spin_lock(&cpuctx->hrtimer_lock); if (rotations) hrtimer_forward_now(hr, cpuctx->hrtimer_interval); else cpuctx->hrtimer_active = 0; raw_spin_unlock(&cpuctx->hrtimer_lock); return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART; } static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) { struct hrtimer *timer = &cpuctx->hrtimer; struct pmu *pmu = cpuctx->ctx.pmu; u64 interval; /* no multiplexing needed for SW PMU */ if (pmu->task_ctx_nr == perf_sw_context) return; /* * check default is sane, if not set then force to * default interval (1/tick) */ interval = pmu->hrtimer_interval_ms; if (interval < 1) interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); raw_spin_lock_init(&cpuctx->hrtimer_lock); hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); timer->function = perf_mux_hrtimer_handler; } static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx) { struct hrtimer *timer = &cpuctx->hrtimer; struct pmu *pmu = cpuctx->ctx.pmu; unsigned long flags; /* not for SW PMU */ if (pmu->task_ctx_nr == perf_sw_context) return 0; raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags); if (!cpuctx->hrtimer_active) { cpuctx->hrtimer_active = 1; hrtimer_forward_now(timer, cpuctx->hrtimer_interval); hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); } raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags); return 0; } static int perf_mux_hrtimer_restart_ipi(void *arg) { return perf_mux_hrtimer_restart(arg); } void perf_pmu_disable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); if (!(*count)++) pmu->pmu_disable(pmu); } void perf_pmu_enable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); if (!--(*count)) pmu->pmu_enable(pmu); } static DEFINE_PER_CPU(struct list_head, active_ctx_list); /* * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and * perf_event_task_tick() are fully serialized because they're strictly cpu * affine and perf_event_ctx{activate,deactivate} are called with IRQs * disabled, while perf_event_task_tick is called from IRQ context. */ static void perf_event_ctx_activate(struct perf_event_context *ctx) { struct list_head *head = this_cpu_ptr(&active_ctx_list); lockdep_assert_irqs_disabled(); WARN_ON(!list_empty(&ctx->active_ctx_list)); list_add(&ctx->active_ctx_list, head); } static void perf_event_ctx_deactivate(struct perf_event_context *ctx) { lockdep_assert_irqs_disabled(); WARN_ON(list_empty(&ctx->active_ctx_list)); list_del_init(&ctx->active_ctx_list); } static void get_ctx(struct perf_event_context *ctx) { refcount_inc(&ctx->refcount); } static void *alloc_task_ctx_data(struct pmu *pmu) { if (pmu->task_ctx_cache) return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); return NULL; } static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) { if (pmu->task_ctx_cache && task_ctx_data) kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); } static void free_ctx(struct rcu_head *head) { struct perf_event_context *ctx; ctx = container_of(head, struct perf_event_context, rcu_head); free_task_ctx_data(ctx->pmu, ctx->task_ctx_data); kfree(ctx); } static void put_ctx(struct perf_event_context *ctx) { if (refcount_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); if (ctx->task && ctx->task != TASK_TOMBSTONE) put_task_struct(ctx->task); call_rcu(&ctx->rcu_head, free_ctx); } } /* * Because of perf_event::ctx migration in sys_perf_event_open::move_group and * perf_pmu_migrate_context() we need some magic. * * Those places that change perf_event::ctx will hold both * perf_event_ctx::mutex of the 'old' and 'new' ctx value. * * Lock ordering is by mutex address. There are two other sites where * perf_event_context::mutex nests and those are: * * - perf_event_exit_task_context() [ child , 0 ] * perf_event_exit_event() * put_event() [ parent, 1 ] * * - perf_event_init_context() [ parent, 0 ] * inherit_task_group() * inherit_group() * inherit_event() * perf_event_alloc() * perf_init_event() * perf_try_init_event() [ child , 1 ] * * While it appears there is an obvious deadlock here -- the parent and child * nesting levels are inverted between the two. This is in fact safe because * life-time rules separate them. That is an exiting task cannot fork, and a * spawning task cannot (yet) exit. * * But remember that these are parent<->child context relations, and * migration does not affect children, therefore these two orderings should not * interact. * * The change in perf_event::ctx does not affect children (as claimed above) * because the sys_perf_event_open() case will install a new event and break * the ctx parent<->child relation, and perf_pmu_migrate_context() is only * concerned with cpuctx and that doesn't have children. * * The places that change perf_event::ctx will issue: * * perf_remove_from_context(); * synchronize_rcu(); * perf_install_in_context(); * * to affect the change. The remove_from_context() + synchronize_rcu() should * quiesce the event, after which we can install it in the new location. This * means that only external vectors (perf_fops, prctl) can perturb the event * while in transit. Therefore all such accessors should also acquire * perf_event_context::mutex to serialize against this. * * However; because event->ctx can change while we're waiting to acquire * ctx->mutex we must be careful and use the below perf_event_ctx_lock() * function. * * Lock order: * exec_update_lock * task_struct::perf_event_mutex * perf_event_context::mutex * perf_event::child_mutex; * perf_event_context::lock * perf_event::mmap_mutex * mmap_lock * perf_addr_filters_head::lock * * cpu_hotplug_lock * pmus_lock * cpuctx->mutex / perf_event_context::mutex */ static struct perf_event_context * perf_event_ctx_lock_nested(struct perf_event *event, int nesting) { struct perf_event_context *ctx; again: rcu_read_lock(); ctx = READ_ONCE(event->ctx); if (!refcount_inc_not_zero(&ctx->refcount)) { rcu_read_unlock(); goto again; } rcu_read_unlock(); mutex_lock_nested(&ctx->mutex, nesting); if (event->ctx != ctx) { mutex_unlock(&ctx->mutex); put_ctx(ctx); goto again; } return ctx; } static inline struct perf_event_context * perf_event_ctx_lock(struct perf_event *event) { return perf_event_ctx_lock_nested(event, 0); } static void perf_event_ctx_unlock(struct perf_event *event, struct perf_event_context *ctx) { mutex_unlock(&ctx->mutex); put_ctx(ctx); } /* * This must be done under the ctx->lock, such as to serialize against * context_equiv(), therefore we cannot call put_ctx() since that might end up * calling scheduler related locks and ctx->lock nests inside those. */ static __must_check struct perf_event_context * unclone_ctx(struct perf_event_context *ctx) { struct perf_event_context *parent_ctx = ctx->parent_ctx; lockdep_assert_held(&ctx->lock); if (parent_ctx) ctx->parent_ctx = NULL; ctx->generation++; return parent_ctx; } static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p, enum pid_type type) { u32 nr; /* * only top level events have the pid namespace they were created in */ if (event->parent) event = event->parent; nr = __task_pid_nr_ns(p, type, event->ns); /* avoid -1 if it is idle thread or runs in another ns */ if (!nr && !pid_alive(p)) nr = -1; return nr; } static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) { return perf_event_pid_type(event, p, PIDTYPE_TGID); } static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) { return perf_event_pid_type(event, p, PIDTYPE_PID); } /* * If we inherit events we want to return the parent event id * to userspace. */ static u64 primary_event_id(struct perf_event *event) { u64 id = event->id; if (event->parent) id = event->parent->id; return id; } /* * Get the perf_event_context for a task and lock it. * * This has to cope with the fact that until it is locked, * the context could get moved to another task. */ static struct perf_event_context * perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) { struct perf_event_context *ctx; retry: /* * One of the few rules of preemptible RCU is that one cannot do * rcu_read_unlock() while holding a scheduler (or nested) lock when * part of the read side critical section was irqs-enabled -- see * rcu_read_unlock_special(). * * Since ctx->lock nests under rq->lock we must ensure the entire read * side critical section has interrupts disabled. */ local_irq_save(*flags); rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); if (ctx) { /* * If this context is a clone of another, it might * get swapped for another underneath us by * perf_event_task_sched_out, though the * rcu_read_lock() protects us from any context * getting freed. Lock the context and check if it * got swapped before we could get the lock, and retry * if so. If we locked the right context, then it * can't get swapped on us any more. */ raw_spin_lock(&ctx->lock); if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { raw_spin_unlock(&ctx->lock); rcu_read_unlock(); local_irq_restore(*flags); goto retry; } if (ctx->task == TASK_TOMBSTONE || !refcount_inc_not_zero(&ctx->refcount)) { raw_spin_unlock(&ctx->lock); ctx = NULL; } else { WARN_ON_ONCE(ctx->task != task); } } rcu_read_unlock(); if (!ctx) local_irq_restore(*flags); return ctx; } /* * Get the context for a task and increment its pin_count so it * can't get swapped to another task. This also increments its * reference count so that the context can't get freed. */ static struct perf_event_context * perf_pin_task_context(struct task_struct *task, int ctxn) { struct perf_event_context *ctx; unsigned long flags; ctx = perf_lock_task_context(task, ctxn, &flags); if (ctx) { ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); } return ctx; } static void perf_unpin_context(struct perf_event_context *ctx) { unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); --ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); } /* * Update the record of the current time in a context. */ static void __update_context_time(struct perf_event_context *ctx, bool adv) { u64 now = perf_clock(); if (adv) ctx->time += now - ctx->timestamp; ctx->timestamp = now; /* * The above: time' = time + (now - timestamp), can be re-arranged * into: time` = now + (time - timestamp), which gives a single value * offset to compute future time without locks on. * * See perf_event_time_now(), which can be used from NMI context where * it's (obviously) not possible to acquire ctx->lock in order to read * both the above values in a consistent manner. */ WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); } static void update_context_time(struct perf_event_context *ctx) { __update_context_time(ctx, true); } static u64 perf_event_time(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; if (unlikely(!ctx)) return 0; if (is_cgroup_event(event)) return perf_cgroup_event_time(event); return ctx->time; } static u64 perf_event_time_now(struct perf_event *event, u64 now) { struct perf_event_context *ctx = event->ctx; if (unlikely(!ctx)) return 0; if (is_cgroup_event(event)) return perf_cgroup_event_time_now(event, now); if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) return ctx->time; now += READ_ONCE(ctx->timeoffset); return now; } static enum event_type_t get_event_type(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; enum event_type_t event_type; lockdep_assert_held(&ctx->lock); /* * It's 'group type', really, because if our group leader is * pinned, so are we. */ if (event->group_leader != event) event = event->group_leader; event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE; if (!ctx->task) event_type |= EVENT_CPU; return event_type; } /* * Helper function to initialize event group nodes. */ static void init_event_group(struct perf_event *event) { RB_CLEAR_NODE(&event->group_node); event->group_index = 0; } /* * Extract pinned or flexible groups from the context * based on event attrs bits. */ static struct perf_event_groups * get_event_groups(struct perf_event *event, struct perf_event_context *ctx) { if (event->attr.pinned) return &ctx->pinned_groups; else return &ctx->flexible_groups; } /* * Helper function to initializes perf_event_group trees. */ static void perf_event_groups_init(struct perf_event_groups *groups) { groups->tree = RB_ROOT; groups->index = 0; } /* * Compare function for event groups; * * Implements complex key that first sorts by CPU and then by virtual index * which provides ordering when rotating groups for the same CPU. */ static bool perf_event_groups_less(struct perf_event *left, struct perf_event *right) { if (left->cpu < right->cpu) return true; if (left->cpu > right->cpu) return false; #ifdef CONFIG_CGROUP_PERF if (left->cgrp != right->cgrp) { if (!left->cgrp || !left->cgrp->css.cgroup) { /* * Left has no cgroup but right does, no cgroups come * first. */ return true; } if (!right->cgrp || !right->cgrp->css.cgroup) { /* * Right has no cgroup but left does, no cgroups come * first. */ return false; } /* Two dissimilar cgroups, order by id. */ if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id) return true; return false; } #endif if (left->group_index < right->group_index) return true; if (left->group_index > right->group_index) return false; return false; } /* * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for * key (see perf_event_groups_less). This places it last inside the CPU * subtree. */ static void perf_event_groups_insert(struct perf_event_groups *groups, struct perf_event *event) { struct perf_event *node_event; struct rb_node *parent; struct rb_node **node; event->group_index = ++groups->index; node = &groups->tree.rb_node; parent = *node; while (*node) { parent = *node; node_event = container_of(*node, struct perf_event, group_node); if (perf_event_groups_less(event, node_event)) node = &parent->rb_left; else node = &parent->rb_right; } rb_link_node(&event->group_node, parent, node); rb_insert_color(&event->group_node, &groups->tree); } /* * Helper function to insert event into the pinned or flexible groups. */ static void add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_groups *groups; groups = get_event_groups(event, ctx); perf_event_groups_insert(groups, event); } /* * Delete a group from a tree. */ static void perf_event_groups_delete(struct perf_event_groups *groups, struct perf_event *event) { WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) || RB_EMPTY_ROOT(&groups->tree)); rb_erase(&event->group_node, &groups->tree); init_event_group(event); } /* * Helper function to delete event from its groups. */ static void del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_groups *groups; groups = get_event_groups(event, ctx); perf_event_groups_delete(groups, event); } /* * Get the leftmost event in the cpu/cgroup subtree. */ static struct perf_event * perf_event_groups_first(struct perf_event_groups *groups, int cpu, struct cgroup *cgrp) { struct perf_event *node_event = NULL, *match = NULL; struct rb_node *node = groups->tree.rb_node; #ifdef CONFIG_CGROUP_PERF u64 node_cgrp_id, cgrp_id = 0; if (cgrp) cgrp_id = cgrp->kn->id; #endif while (node) { node_event = container_of(node, struct perf_event, group_node); if (cpu < node_event->cpu) { node = node->rb_left; continue; } if (cpu > node_event->cpu) { node = node->rb_right; continue; } #ifdef CONFIG_CGROUP_PERF node_cgrp_id = 0; if (node_event->cgrp && node_event->cgrp->css.cgroup) node_cgrp_id = node_event->cgrp->css.cgroup->kn->id; if (cgrp_id < node_cgrp_id) { node = node->rb_left; continue; } if (cgrp_id > node_cgrp_id) { node = node->rb_right; continue; } #endif match = node_event; node = node->rb_left; } return match; } /* * Like rb_entry_next_safe() for the @cpu subtree. */ static struct perf_event * perf_event_groups_next(struct perf_event *event) { struct perf_event *next; #ifdef CONFIG_CGROUP_PERF u64 curr_cgrp_id = 0; u64 next_cgrp_id = 0; #endif next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node); if (next == NULL || next->cpu != event->cpu) return NULL; #ifdef CONFIG_CGROUP_PERF if (event->cgrp && event->cgrp->css.cgroup) curr_cgrp_id = event->cgrp->css.cgroup->kn->id; if (next->cgrp && next->cgrp->css.cgroup) next_cgrp_id = next->cgrp->css.cgroup->kn->id; if (curr_cgrp_id != next_cgrp_id) return NULL; #endif return next; } /* * Iterate through the whole groups tree. */ #define perf_event_groups_for_each(event, groups) \ for (event = rb_entry_safe(rb_first(&((groups)->tree)), \ typeof(*event), group_node); event; \ event = rb_entry_safe(rb_next(&event->group_node), \ typeof(*event), group_node)) /* * Add an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { lockdep_assert_held(&ctx->lock); WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); event->attach_state |= PERF_ATTACH_CONTEXT; event->tstamp = perf_event_time(event); /* * If we're a stand alone event or group leader, we go to the context * list, group events are kept attached to the group so that * perf_group_detach can, at all times, locate all siblings. */ if (event->group_leader == event) { event->group_caps = event->event_caps; add_event_to_groups(event, ctx); } list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; if (event->attr.inherit_stat) ctx->nr_stat++; if (event->state > PERF_EVENT_STATE_OFF) perf_cgroup_event_enable(event, ctx); ctx->generation++; } /* * Initialize event state based on the perf_event_attr::disabled. */ static inline void perf_event__state_init(struct perf_event *event) { event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF : PERF_EVENT_STATE_INACTIVE; } static int __perf_event_read_size(u64 read_format, int nr_siblings) { int entry = sizeof(u64); /* value */ int size = 0; int nr = 1; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) size += sizeof(u64); if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) size += sizeof(u64); if (read_format & PERF_FORMAT_ID) entry += sizeof(u64); if (read_format & PERF_FORMAT_LOST) entry += sizeof(u64); if (read_format & PERF_FORMAT_GROUP) { nr += nr_siblings; size += sizeof(u64); } /* * Since perf_event_validate_size() limits this to 16k and inhibits * adding more siblings, this will never overflow. */ return size + nr * entry; } static void __perf_event_header_size(struct perf_event *event, u64 sample_type) { struct perf_sample_data *data; u16 size = 0; if (sample_type & PERF_SAMPLE_IP) size += sizeof(data->ip); if (sample_type & PERF_SAMPLE_ADDR) size += sizeof(data->addr); if (sample_type & PERF_SAMPLE_PERIOD) size += sizeof(data->period); if (sample_type & PERF_SAMPLE_WEIGHT) size += sizeof(data->weight); if (sample_type & PERF_SAMPLE_READ) size += event->read_size; if (sample_type & PERF_SAMPLE_DATA_SRC) size += sizeof(data->data_src.val); if (sample_type & PERF_SAMPLE_TRANSACTION) size += sizeof(data->txn); if (sample_type & PERF_SAMPLE_PHYS_ADDR) size += sizeof(data->phys_addr); if (sample_type & PERF_SAMPLE_CGROUP) size += sizeof(data->cgroup); event->header_size = size; } /* * Called at perf_event creation and when events are attached/detached from a * group. */ static void perf_event__header_size(struct perf_event *event) { event->read_size = __perf_event_read_size(event->attr.read_format, event->group_leader->nr_siblings); __perf_event_header_size(event, event->attr.sample_type); } static void perf_event__id_header_size(struct perf_event *event) { struct perf_sample_data *data; u64 sample_type = event->attr.sample_type; u16 size = 0; if (sample_type & PERF_SAMPLE_TID) size += sizeof(data->tid_entry); if (sample_type & PERF_SAMPLE_TIME) size += sizeof(data->time); if (sample_type & PERF_SAMPLE_IDENTIFIER) size += sizeof(data->id); if (sample_type & PERF_SAMPLE_ID) size += sizeof(data->id); if (sample_type & PERF_SAMPLE_STREAM_ID) size += sizeof(data->stream_id); if (sample_type & PERF_SAMPLE_CPU) size += sizeof(data->cpu_entry); event->id_header_size = size; } /* * Check that adding an event to the group does not result in anybody * overflowing the 64k event limit imposed by the output buffer. * * Specifically, check that the read_size for the event does not exceed 16k, * read_size being the one term that grows with groups size. Since read_size * depends on per-event read_format, also (re)check the existing events. * * This leaves 48k for the constant size fields and things like callchains, * branch stacks and register sets. */ static bool perf_event_validate_size(struct perf_event *event) { struct perf_event *sibling, *group_leader = event->group_leader; if (__perf_event_read_size(event->attr.read_format, group_leader->nr_siblings + 1) > 16*1024) return false; if (__perf_event_read_size(group_leader->attr.read_format, group_leader->nr_siblings + 1) > 16*1024) return false; /* * When creating a new group leader, group_leader->ctx is initialized * after the size has been validated, but we cannot safely use * for_each_sibling_event() until group_leader->ctx is set. A new group * leader cannot have any siblings yet, so we can safely skip checking * the non-existent siblings. */ if (event == group_leader) return true; for_each_sibling_event(sibling, group_leader) { if (__perf_event_read_size(sibling->attr.read_format, group_leader->nr_siblings + 1) > 16*1024) return false; } return true; } static void perf_group_attach(struct perf_event *event) { struct perf_event *group_leader = event->group_leader, *pos; lockdep_assert_held(&event->ctx->lock); /* * We can have double attach due to group movement in perf_event_open. */ if (event->attach_state & PERF_ATTACH_GROUP) return; event->attach_state |= PERF_ATTACH_GROUP; if (group_leader == event) return; WARN_ON_ONCE(group_leader->ctx != event->ctx); group_leader->group_caps &= event->event_caps; list_add_tail(&event->sibling_list, &group_leader->sibling_list); group_leader->nr_siblings++; group_leader->group_generation++; perf_event__header_size(group_leader); for_each_sibling_event(pos, group_leader) perf_event__header_size(pos); } /* * Remove an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); /* * We can have double detach due to exit/hot-unplug + close. */ if (!(event->attach_state & PERF_ATTACH_CONTEXT)) return; event->attach_state &= ~PERF_ATTACH_CONTEXT; ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; list_del_rcu(&event->event_entry); if (event->group_leader == event) del_event_from_groups(event, ctx); /* * If event was in error state, then keep it * that way, otherwise bogus counts will be * returned on read(). The only way to get out * of error state is by explicit re-enabling * of the event */ if (event->state > PERF_EVENT_STATE_OFF) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_OFF); } ctx->generation++; } static int perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) { if (!has_aux(aux_event)) return 0; if (!event->pmu->aux_output_match) return 0; return event->pmu->aux_output_match(aux_event); } static void put_event(struct perf_event *event); static void event_sched_out(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx); static void perf_put_aux_event(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event *iter; /* * If event uses aux_event tear down the link */ if (event->aux_event) { iter = event->aux_event; event->aux_event = NULL; put_event(iter); return; } /* * If the event is an aux_event, tear down all links to * it from other events. */ for_each_sibling_event(iter, event->group_leader) { if (iter->aux_event != event) continue; iter->aux_event = NULL; put_event(event); /* * If it's ACTIVE, schedule it out and put it into ERROR * state so that we don't try to schedule it again. Note * that perf_event_enable() will clear the ERROR status. */ event_sched_out(iter, cpuctx, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } } static bool perf_need_aux_event(struct perf_event *event) { return !!event->attr.aux_output || !!event->attr.aux_sample_size; } static int perf_get_aux_event(struct perf_event *event, struct perf_event *group_leader) { /* * Our group leader must be an aux event if we want to be * an aux_output. This way, the aux event will precede its * aux_output events in the group, and therefore will always * schedule first. */ if (!group_leader) return 0; /* * aux_output and aux_sample_size are mutually exclusive. */ if (event->attr.aux_output && event->attr.aux_sample_size) return 0; if (event->attr.aux_output && !perf_aux_output_match(event, group_leader)) return 0; if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux) return 0; if (!atomic_long_inc_not_zero(&group_leader->refcount)) return 0; /* * Link aux_outputs to their aux event; this is undone in * perf_group_detach() by perf_put_aux_event(). When the * group in torn down, the aux_output events loose their * link to the aux_event and can't schedule any more. */ event->aux_event = group_leader; return 1; } static inline struct list_head *get_event_list(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active; } /* * Events that have PERF_EV_CAP_SIBLING require being part of a group and * cannot exist on their own, schedule them out and move them into the ERROR * state. Also see _perf_event_enable(), it will not be able to recover * this ERROR state. */ static inline void perf_remove_sibling_event(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); event_sched_out(event, cpuctx, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } static void perf_group_detach(struct perf_event *event) { struct perf_event *leader = event->group_leader; struct perf_event *sibling, *tmp; struct perf_event_context *ctx = event->ctx; lockdep_assert_held(&ctx->lock); /* * We can have double detach due to exit/hot-unplug + close. */ if (!(event->attach_state & PERF_ATTACH_GROUP)) return; event->attach_state &= ~PERF_ATTACH_GROUP; perf_put_aux_event(event); /* * If this is a sibling, remove it from its group. */ if (leader != event) { list_del_init(&event->sibling_list); event->group_leader->nr_siblings--; event->group_leader->group_generation++; goto out; } /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them * to whatever list we are on. */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { if (sibling->event_caps & PERF_EV_CAP_SIBLING) perf_remove_sibling_event(sibling); sibling->group_leader = sibling; list_del_init(&sibling->sibling_list); /* Inherit group flags from the previous leader */ sibling->group_caps = event->group_caps; if (!RB_EMPTY_NODE(&event->group_node)) { add_event_to_groups(sibling, event->ctx); if (sibling->state == PERF_EVENT_STATE_ACTIVE) list_add_tail(&sibling->active_list, get_event_list(sibling)); } WARN_ON_ONCE(sibling->ctx != event->ctx); } out: for_each_sibling_event(tmp, leader) perf_event__header_size(tmp); perf_event__header_size(leader); } static bool is_orphaned_event(struct perf_event *event) { return event->state == PERF_EVENT_STATE_DEAD; } static inline int __pmu_filter_match(struct perf_event *event) { struct pmu *pmu = event->pmu; return pmu->filter_match ? pmu->filter_match(event) : 1; } /* * Check whether we should attempt to schedule an event group based on * PMU-specific filtering. An event group can consist of HW and SW events, * potentially with a SW leader, so we must check all the filters, to * determine whether a group is schedulable: */ static inline int pmu_filter_match(struct perf_event *event) { struct perf_event *sibling; if (!__pmu_filter_match(event)) return 0; for_each_sibling_event(sibling, event) { if (!__pmu_filter_match(sibling)) return 0; } return 1; } static inline int event_filter_match(struct perf_event *event) { return (event->cpu == -1 || event->cpu == smp_processor_id()) && perf_cgroup_match(event) && pmu_filter_match(event); } static void event_sched_out(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); if (event->state != PERF_EVENT_STATE_ACTIVE) return; /* * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but * we can schedule events _OUT_ individually through things like * __perf_remove_from_context(). */ list_del_init(&event->active_list); perf_pmu_disable(event->pmu); event->pmu->del(event, 0); event->oncpu = -1; if (READ_ONCE(event->pending_disable) >= 0) { WRITE_ONCE(event->pending_disable, -1); perf_cgroup_event_disable(event, ctx); state = PERF_EVENT_STATE_OFF; } perf_event_set_state(event, state); if (!is_software_event(event)) cpuctx->active_oncpu--; if (!--ctx->nr_active) perf_event_ctx_deactivate(ctx); if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq--; if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; perf_pmu_enable(event->pmu); } static void group_sched_out(struct perf_event *group_event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { struct perf_event *event; if (group_event->state != PERF_EVENT_STATE_ACTIVE) return; perf_pmu_disable(ctx->pmu); event_sched_out(group_event, cpuctx, ctx); /* * Schedule out siblings (if any): */ for_each_sibling_event(event, group_event) event_sched_out(event, cpuctx, ctx); perf_pmu_enable(ctx->pmu); } #define DETACH_GROUP 0x01UL /* * Cross CPU call to remove a performance event * * We disable the event on the hardware level first. After that we * remove it from the context list. */ static void __perf_remove_from_context(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { unsigned long flags = (unsigned long)info; if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); update_cgrp_time_from_cpuctx(cpuctx, false); } event_sched_out(event, cpuctx, ctx); if (flags & DETACH_GROUP) perf_group_detach(event); list_del_event(event, ctx); if (!ctx->nr_events && ctx->is_active) { if (ctx == &cpuctx->ctx) update_cgrp_time_from_cpuctx(cpuctx, true); ctx->is_active = 0; ctx->rotate_necessary = 0; if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); cpuctx->task_ctx = NULL; } } } /* * Remove the event from a task's (or a CPU's) list of events. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This is OK when called from perf_release since * that only calls us on the top-level context, which can't be a clone. * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ static void perf_remove_from_context(struct perf_event *event, unsigned long flags) { struct perf_event_context *ctx = event->ctx; lockdep_assert_held(&ctx->mutex); event_function_call(event, __perf_remove_from_context, (void *)flags); /* * The above event_function_call() can NO-OP when it hits * TASK_TOMBSTONE. In that case we must already have been detached * from the context (by perf_event_exit_event()) but the grouping * might still be in-tact. */ WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); if ((flags & DETACH_GROUP) && (event->attach_state & PERF_ATTACH_GROUP)) { /* * Since in that case we cannot possibly be scheduled, simply * detach now. */ raw_spin_lock_irq(&ctx->lock); perf_group_detach(event); raw_spin_unlock_irq(&ctx->lock); } } /* * Cross CPU call to disable a performance event */ static void __perf_event_disable(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { if (event->state < PERF_EVENT_STATE_INACTIVE) return; if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); update_cgrp_time_from_event(event); } if (event == event->group_leader) group_sched_out(event, cpuctx, ctx); else event_sched_out(event, cpuctx, ctx); perf_event_set_state(event, PERF_EVENT_STATE_OFF); perf_cgroup_event_disable(event, ctx); } /* * Disable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This condition is satisfied when called through * perf_event_for_each_child or perf_event_for_each because they * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). * * When called from perf_pending_event it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ static void _perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; raw_spin_lock_irq(&ctx->lock); if (event->state <= PERF_EVENT_STATE_OFF) { raw_spin_unlock_irq(&ctx->lock); return; } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_event_disable, NULL); } void perf_event_disable_local(struct perf_event *event) { event_function_local(event, __perf_event_disable, NULL); } /* * Strictly speaking kernel users cannot create groups and therefore this * interface does not need the perf_event_ctx_lock() magic. */ void perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx; ctx = perf_event_ctx_lock(event); _perf_event_disable(event); perf_event_ctx_unlock(event, ctx); } EXPORT_SYMBOL_GPL(perf_event_disable); void perf_event_disable_inatomic(struct perf_event *event) { WRITE_ONCE(event->pending_disable, smp_processor_id()); /* can fail, see perf_pending_event_disable() */ irq_work_queue(&event->pending); } #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); static void perf_log_itrace_start(struct perf_event *event); static int event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { int ret = 0; WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); if (event->state <= PERF_EVENT_STATE_OFF) return 0; WRITE_ONCE(event->oncpu, smp_processor_id()); /* * Order event::oncpu write to happen before the ACTIVE state is * visible. This allows perf_event_{stop,read}() to observe the correct * ->oncpu if it sees ACTIVE. */ smp_wmb(); perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE); /* * Unthrottle events, since we scheduled we might have missed several * ticks already, also for a heavily scheduling task there is little * guarantee it'll get a tick in a timely manner. */ if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { perf_log_throttle(event, 1); event->hw.interrupts = 0; } perf_pmu_disable(event->pmu); perf_log_itrace_start(event); if (event->pmu->add(event, PERF_EF_START)) { perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); event->oncpu = -1; ret = -EAGAIN; goto out; } if (!is_software_event(event)) cpuctx->active_oncpu++; if (!ctx->nr_active++) perf_event_ctx_activate(ctx); if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq++; if (event->attr.exclusive) cpuctx->exclusive = 1; out: perf_pmu_enable(event->pmu); return ret; } static int group_sched_in(struct perf_event *group_event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { struct perf_event *event, *partial_group = NULL; struct pmu *pmu = ctx->pmu; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; pmu->start_txn(pmu, PERF_PMU_TXN_ADD); if (event_sched_in(group_event, cpuctx, ctx)) goto error; /* * Schedule in siblings as one group (if any): */ for_each_sibling_event(event, group_event) { if (event_sched_in(event, cpuctx, ctx)) { partial_group = event; goto group_error; } } if (!pmu->commit_txn(pmu)) return 0; group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: * The events up to the failed event are scheduled out normally. */ for_each_sibling_event(event, group_event) { if (event == partial_group) break; event_sched_out(event, cpuctx, ctx); } event_sched_out(group_event, cpuctx, ctx); error: pmu->cancel_txn(pmu); return -EAGAIN; } /* * Work out whether we can put this event group on the CPU now. */ static int group_can_go_on(struct perf_event *event, struct perf_cpu_context *cpuctx, int can_add_hw) { /* * Groups consisting entirely of software events can always go on. */ if (event->group_caps & PERF_EV_CAP_SOFTWARE) return 1; /* * If an exclusive group is already on, no other hardware * events can go on. */ if (cpuctx->exclusive) return 0; /* * If this group is exclusive and there are already * events on the CPU, it can't go on. */ if (event->attr.exclusive && !list_empty(get_event_list(event))) return 0; /* * Otherwise, try to add it if all previous groups were able * to go on. */ return can_add_hw; } static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { list_add_event(event, ctx); perf_group_attach(event); } static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type); static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type, struct task_struct *task); static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, enum event_type_t event_type) { if (!cpuctx->task_ctx) return; if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; ctx_sched_out(ctx, cpuctx, event_type); } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, struct task_struct *task) { cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task); if (ctx) ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); if (ctx) ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); } /* * We want to maintain the following priority of scheduling: * - CPU pinned (EVENT_CPU | EVENT_PINNED) * - task pinned (EVENT_PINNED) * - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE) * - task flexible (EVENT_FLEXIBLE). * * In order to avoid unscheduling and scheduling back in everything every * time an event is added, only do it for the groups of equal priority and * below. * * This can be called after a batch operation on task events, in which case * event_type is a bit mask of the types of events involved. For CPU events, * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE. */ static void ctx_resched(struct perf_cpu_context *cpuctx, struct perf_event_context *task_ctx, enum event_type_t event_type) { enum event_type_t ctx_event_type; bool cpu_event = !!(event_type & EVENT_CPU); /* * If pinned groups are involved, flexible groups also need to be * scheduled out. */ if (event_type & EVENT_PINNED) event_type |= EVENT_FLEXIBLE; ctx_event_type = event_type & EVENT_ALL; perf_pmu_disable(cpuctx->ctx.pmu); if (task_ctx) task_ctx_sched_out(cpuctx, task_ctx, event_type); /* * Decide which cpu ctx groups to schedule out based on the types * of events that caused rescheduling: * - EVENT_CPU: schedule out corresponding groups; * - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups; * - otherwise, do nothing more. */ if (cpu_event) cpu_ctx_sched_out(cpuctx, ctx_event_type); else if (ctx_event_type & EVENT_PINNED) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, task_ctx, current); perf_pmu_enable(cpuctx->ctx.pmu); } void perf_pmu_resched(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; perf_ctx_lock(cpuctx, task_ctx); ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU); perf_ctx_unlock(cpuctx, task_ctx); } /* * Cross CPU call to install and enable a performance event * * Very similar to remote_function() + event_function() but cannot assume that * things like ctx->is_active and cpuctx->task_ctx are set. */ static int __perf_install_in_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event_context *task_ctx = cpuctx->task_ctx; bool reprogram = true; int ret = 0; raw_spin_lock(&cpuctx->ctx.lock); if (ctx->task) { raw_spin_lock(&ctx->lock); task_ctx = ctx; reprogram = (ctx->task == current); /* * If the task is running, it must be running on this CPU, * otherwise we cannot reprogram things. * * If its not running, we don't care, ctx->lock will * serialize against it becoming runnable. */ if (task_curr(ctx->task) && !reprogram) { ret = -ESRCH; goto unlock; } WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx); } else if (task_ctx) { raw_spin_lock(&task_ctx->lock); } #ifdef CONFIG_CGROUP_PERF if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) { /* * If the current cgroup doesn't match the event's * cgroup, we should not try to schedule it. */ struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); reprogram = cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup); } #endif if (reprogram) { ctx_sched_out(ctx, cpuctx, EVENT_TIME); add_event_to_ctx(event, ctx); ctx_resched(cpuctx, task_ctx, get_event_type(event)); } else { add_event_to_ctx(event, ctx); } unlock: perf_ctx_unlock(cpuctx, task_ctx); return ret; } static bool exclusive_event_installable(struct perf_event *event, struct perf_event_context *ctx); /* * Attach a performance event to a context. * * Very similar to event_function_call, see comment there. */ static void perf_install_in_context(struct perf_event_context *ctx, struct perf_event *event, int cpu) { struct task_struct *task = READ_ONCE(ctx->task); lockdep_assert_held(&ctx->mutex); WARN_ON_ONCE(!exclusive_event_installable(event, ctx)); if (event->cpu != -1) event->cpu = cpu; /* * Ensures that if we can observe event->ctx, both the event and ctx * will be 'complete'. See perf_iterate_sb_cpu(). */ smp_store_release(&event->ctx, ctx); /* * perf_event_attr::disabled events will not run and can be initialized * without IPI. Except when this is the first event for the context, in * that case we need the magic of the IPI to set ctx->is_active. * * The IOC_ENABLE that is sure to follow the creation of a disabled * event will issue the IPI and reprogram the hardware. */ if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) { raw_spin_lock_irq(&ctx->lock); if (ctx->task == TASK_TOMBSTONE) { raw_spin_unlock_irq(&ctx->lock); return; } add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); return; } if (!task) { cpu_function_call(cpu, __perf_install_in_context, event); return; } /* * Should not happen, we validate the ctx is still alive before calling. */ if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) return; /* * Installing events is tricky because we cannot rely on ctx->is_active * to be set in case this is the nr_events 0 -> 1 transition. * * Instead we use task_curr(), which tells us if the task is running. * However, since we use task_curr() outside of rq::lock, we can race * against the actual state. This means the result can be wrong. * * If we get a false positive, we retry, this is harmless. * * If we get a false negative, things are complicated. If we are after * perf_event_context_sched_in() ctx::lock will serialize us, and the * value must be correct. If we're before, it doesn't matter since * perf_event_context_sched_in() will program the counter. * * However, this hinges on the remote context switch having observed * our task->perf_event_ctxp[] store, such that it will in fact take * ctx::lock in perf_event_context_sched_in(). * * We do this by task_function_call(), if the IPI fails to hit the task * we know any future context switch of task must see the * perf_event_ctpx[] store. */ /* * This smp_mb() orders the task->perf_event_ctxp[] store with the * task_cpu() load, such that if the IPI then does not find the task * running, a future context switch of that task must observe the * store. */ smp_mb(); again: if (!task_function_call(task, __perf_install_in_context, event)) return; raw_spin_lock_irq(&ctx->lock); task = ctx->task; if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) { /* * Cannot happen because we already checked above (which also * cannot happen), and we hold ctx->mutex, which serializes us * against perf_event_exit_task_context(). */ raw_spin_unlock_irq(&ctx->lock); return; } /* * If the task is not running, ctx->lock will avoid it becoming so, * thus we can safely install the event. */ if (task_curr(task)) { raw_spin_unlock_irq(&ctx->lock); goto again; } add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); } /* * Cross CPU call to enable a performance event */ static void __perf_event_enable(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { struct perf_event *leader = event->group_leader; struct perf_event_context *task_ctx; if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state <= PERF_EVENT_STATE_ERROR) return; if (ctx->is_active) ctx_sched_out(ctx, cpuctx, EVENT_TIME); perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); perf_cgroup_event_enable(event, ctx); if (!ctx->is_active) return; if (!event_filter_match(event)) { ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); return; } /* * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); return; } task_ctx = cpuctx->task_ctx; if (ctx->task) WARN_ON_ONCE(task_ctx != ctx); ctx_resched(cpuctx, task_ctx, get_event_type(event)); } /* * Enable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This condition is satisfied when called through * perf_event_for_each_child or perf_event_for_each as described * for perf_event_disable. */ static void _perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; raw_spin_lock_irq(&ctx->lock); if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state < PERF_EVENT_STATE_ERROR) { out: raw_spin_unlock_irq(&ctx->lock); return; } /* * If the event is in error state, clear that first. * * That way, if we see the event in error state below, we know that it * has gone back into error state, as distinct from the task having * been scheduled away before the cross-call arrived. */ if (event->state == PERF_EVENT_STATE_ERROR) { /* * Detached SIBLING events cannot leave ERROR state. */ if (event->event_caps & PERF_EV_CAP_SIBLING && event->group_leader == event) goto out; event->state = PERF_EVENT_STATE_OFF; } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_event_enable, NULL); } /* * See perf_event_disable(); */ void perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx; ctx = perf_event_ctx_lock(event); _perf_event_enable(event); perf_event_ctx_unlock(event, ctx); } EXPORT_SYMBOL_GPL(perf_event_enable); struct stop_event_data { struct perf_event *event; unsigned int restart; }; static int __perf_event_stop(void *info) { struct stop_event_data *sd = info; struct perf_event *event = sd->event; /* if it's already INACTIVE, do nothing */ if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) return 0; /* matches smp_wmb() in event_sched_in() */ smp_rmb(); /* * There is a window with interrupts enabled before we get here, * so we need to check again lest we try to stop another CPU's event. */ if (READ_ONCE(event->oncpu) != smp_processor_id()) return -EAGAIN; event->pmu->stop(event, PERF_EF_UPDATE); /* * May race with the actual stop (through perf_pmu_output_stop()), * but it is only used for events with AUX ring buffer, and such * events will refuse to restart because of rb::aux_mmap_count==0, * see comments in perf_aux_output_begin(). * * Since this is happening on an event-local CPU, no trace is lost * while restarting. */ if (sd->restart) event->pmu->start(event, 0); return 0; } static int perf_event_stop(struct perf_event *event, int restart) { struct stop_event_data sd = { .event = event, .restart = restart, }; int ret = 0; do { if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) return 0; /* matches smp_wmb() in event_sched_in() */ smp_rmb(); /* * We only want to restart ACTIVE events, so if the event goes * inactive here (event->oncpu==-1), there's nothing more to do; * fall through with ret==-ENXIO. */ ret = cpu_function_call(READ_ONCE(event->oncpu), __perf_event_stop, &sd); } while (ret == -EAGAIN); return ret; } /* * In order to contain the amount of racy and tricky in the address filter * configuration management, it is a two part process: * * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, * we update the addresses of corresponding vmas in * event::addr_filter_ranges array and bump the event::addr_filters_gen; * (p2) when an event is scheduled in (pmu::add), it calls * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() * if the generation has changed since the previous call. * * If (p1) happens while the event is active, we restart it to force (p2). * * (1) perf_addr_filters_apply(): adjusting filters' offsets based on * pre-existing mappings, called once when new filters arrive via SET_FILTER * ioctl; * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly * registered mapping, called for every new mmap(), with mm::mmap_lock down * for reading; * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process * of exec. */ void perf_event_addr_filters_sync(struct perf_event *event) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); if (!has_addr_filter(event)) return; raw_spin_lock(&ifh->lock); if (event->addr_filters_gen != event->hw.addr_filters_gen) { event->pmu->addr_filters_sync(event); event->hw.addr_filters_gen = event->addr_filters_gen; } raw_spin_unlock(&ifh->lock); } EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync); static int _perf_event_refresh(struct perf_event *event, int refresh) { /* * not supported on inherited events */ if (event->attr.inherit || !is_sampling_event(event)) return -EINVAL; atomic_add(refresh, &event->event_limit); _perf_event_enable(event); return 0; } /* * See perf_event_disable() */ int perf_event_refresh(struct perf_event *event, int refresh) { struct perf_event_context *ctx; int ret; ctx = perf_event_ctx_lock(event); ret = _perf_event_refresh(event, refresh); perf_event_ctx_unlock(event, ctx); return ret; } EXPORT_SYMBOL_GPL(perf_event_refresh); static int perf_event_modify_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { int err; _perf_event_disable(bp); err = modify_user_hw_breakpoint_check(bp, attr, true); if (!bp->attr.disabled) _perf_event_enable(bp); return err; } static int perf_event_modify_attr(struct perf_event *event, struct perf_event_attr *attr) { if (event->attr.type != attr->type) return -EINVAL; switch (event->attr.type) { case PERF_TYPE_BREAKPOINT: return perf_event_modify_breakpoint(event, attr); default: /* Place holder for future additions. */ return -EOPNOTSUPP; } } static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type) { struct perf_event *event, *tmp; int is_active = ctx->is_active; lockdep_assert_held(&ctx->lock); if (likely(!ctx->nr_events)) { /* * See __perf_remove_from_context(). */ WARN_ON_ONCE(ctx->is_active); if (ctx->task) WARN_ON_ONCE(cpuctx->task_ctx); return; } /* * Always update time if it was set; not only when it changes. * Otherwise we can 'forget' to update time for any but the last * context we sched out. For example: * * ctx_sched_out(.event_type = EVENT_FLEXIBLE) * ctx_sched_out(.event_type = EVENT_PINNED) * * would only update time for the pinned events. */ if (is_active & EVENT_TIME) { /* update (and stop) ctx time */ update_context_time(ctx); update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() */ barrier(); } ctx->is_active &= ~event_type; if (!(ctx->is_active & EVENT_ALL)) ctx->is_active = 0; if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); if (!ctx->is_active) cpuctx->task_ctx = NULL; } is_active ^= ctx->is_active; /* changed bits */ if (!ctx->nr_active || !(is_active & EVENT_ALL)) return; perf_pmu_disable(ctx->pmu); if (is_active & EVENT_PINNED) { list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) group_sched_out(event, cpuctx, ctx); } if (is_active & EVENT_FLEXIBLE) { list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list) group_sched_out(event, cpuctx, ctx); /* * Since we cleared EVENT_FLEXIBLE, also clear * rotate_necessary, is will be reset by * ctx_flexible_sched_in() when needed. */ ctx->rotate_necessary = 0; } perf_pmu_enable(ctx->pmu); } /* * Test whether two contexts are equivalent, i.e. whether they have both been * cloned from the same version of the same context. * * Equivalence is measured using a generation number in the context that is * incremented on each modification to it; see unclone_ctx(), list_add_event() * and list_del_event(). */ static int context_equiv(struct perf_event_context *ctx1, struct perf_event_context *ctx2) { lockdep_assert_held(&ctx1->lock); lockdep_assert_held(&ctx2->lock); /* Pinning disables the swap optimization */ if (ctx1->pin_count || ctx2->pin_count) return 0; /* If ctx1 is the parent of ctx2 */ if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen) return 1; /* If ctx2 is the parent of ctx1 */ if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation) return 1; /* * If ctx1 and ctx2 have the same parent; we flatten the parent * hierarchy, see perf_event_init_context(). */ if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && ctx1->parent_gen == ctx2->parent_gen) return 1; /* Unmatched */ return 0; } static void __perf_event_sync_stat(struct perf_event *event, struct perf_event *next_event) { u64 value; if (!event->attr.inherit_stat) return; /* * Update the event value, we cannot use perf_event_read() * because we're in the middle of a context switch and have IRQs * disabled, which upsets smp_call_function_single(), however * we know the event must be on the current CPU, therefore we * don't need to use it. */ if (event->state == PERF_EVENT_STATE_ACTIVE) event->pmu->read(event); perf_event_update_time(event); /* * In order to keep per-task stats reliable we need to flip the event * values when we flip the contexts. */ value = local64_read(&next_event->count); value = local64_xchg(&event->count, value); local64_set(&next_event->count, value); swap(event->total_time_enabled, next_event->total_time_enabled); swap(event->total_time_running, next_event->total_time_running); /* * Since we swizzled the values, update the user visible data too. */ perf_event_update_userpage(event); perf_event_update_userpage(next_event); } static void perf_event_sync_stat(struct perf_event_context *ctx, struct perf_event_context *next_ctx) { struct perf_event *event, *next_event; if (!ctx->nr_stat) return; update_context_time(ctx); event = list_first_entry(&ctx->event_list, struct perf_event, event_entry); next_event = list_first_entry(&next_ctx->event_list, struct perf_event, event_entry); while (&event->event_entry != &ctx->event_list && &next_event->event_entry != &next_ctx->event_list) { __perf_event_sync_stat(event, next_event); event = list_next_entry(event, event_entry); next_event = list_next_entry(next_event, event_entry); } } static void perf_event_context_sched_out(struct task_struct *task, int ctxn, struct task_struct *next) { struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; struct perf_event_context *next_ctx; struct perf_event_context *parent, *next_parent; struct perf_cpu_context *cpuctx; int do_switch = 1; struct pmu *pmu; if (likely(!ctx)) return; pmu = ctx->pmu; cpuctx = __get_cpu_context(ctx); if (!cpuctx->task_ctx) return; rcu_read_lock(); next_ctx = next->perf_event_ctxp[ctxn]; if (!next_ctx) goto unlock; parent = rcu_dereference(ctx->parent_ctx); next_parent = rcu_dereference(next_ctx->parent_ctx); /* If neither context have a parent context; they cannot be clones. */ if (!parent && !next_parent) goto unlock; if (next_parent == ctx || next_ctx == parent || next_parent == parent) { /* * Looks like the two contexts are clones, so we might be * able to optimize the context switch. We lock both * contexts and check that they are clones under the * lock (including re-checking that neither has been * uncloned in the meantime). It doesn't matter which * order we take the locks because no other cpu could * be trying to lock both of these tasks. */ raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); perf_pmu_disable(pmu); if (cpuctx->sched_cb_usage && pmu->sched_task) pmu->sched_task(ctx, false); /* * PMU specific parts of task perf context can require * additional synchronization. As an example of such * synchronization see implementation details of Intel * LBR call stack data profiling; */ if (pmu->swap_task_ctx) pmu->swap_task_ctx(ctx, next_ctx); else swap(ctx->task_ctx_data, next_ctx->task_ctx_data); perf_pmu_enable(pmu); /* * RCU_INIT_POINTER here is safe because we've not * modified the ctx and the above modification of * ctx->task and ctx->task_ctx_data are immaterial * since those values are always verified under * ctx->lock which we're now holding. */ RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx); RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx); do_switch = 0; perf_event_sync_stat(ctx, next_ctx); } raw_spin_unlock(&next_ctx->lock); raw_spin_unlock(&ctx->lock); } unlock: rcu_read_unlock(); if (do_switch) { raw_spin_lock(&ctx->lock); perf_pmu_disable(pmu); if (cpuctx->sched_cb_usage && pmu->sched_task) pmu->sched_task(ctx, false); task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); perf_pmu_enable(pmu); raw_spin_unlock(&ctx->lock); } } static DEFINE_PER_CPU(struct list_head, sched_cb_list); void perf_sched_cb_dec(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); this_cpu_dec(perf_sched_cb_usages); if (!--cpuctx->sched_cb_usage) list_del(&cpuctx->sched_cb_entry); } void perf_sched_cb_inc(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); if (!cpuctx->sched_cb_usage++) list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); this_cpu_inc(perf_sched_cb_usages); } /* * This function provides the context switch callback to the lower code * layer. It is invoked ONLY when the context switch callback is enabled. * * This callback is relevant even to per-cpu events; for example multi event * PEBS requires this to provide PID/TID information. This requires we flush * all queued PEBS records before we context switch to a new task. */ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in) { struct pmu *pmu; pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ if (WARN_ON_ONCE(!pmu->sched_task)) return; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); pmu->sched_task(cpuctx->task_ctx, sched_in); perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static void perf_pmu_sched_task(struct task_struct *prev, struct task_struct *next, bool sched_in) { struct perf_cpu_context *cpuctx; if (prev == next) return; list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { /* will be handled in perf_event_context_sched_in/out */ if (cpuctx->task_ctx) continue; __perf_pmu_sched_task(cpuctx, sched_in); } } static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in); #define for_each_task_context_nr(ctxn) \ for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) /* * Called from scheduler to remove the events of the current task, * with interrupts disabled. * * We stop each event and update the event value in event->count. * * This does not protect us against NMI, but disable() * sets the disabled bit in the control field of event _before_ * accessing the event control register. If a NMI hits, then it will * not restart the event. */ void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) { int ctxn; if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(task, next, false); if (atomic_read(&nr_switch_events)) perf_event_switch(task, next, false); for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); /* * if cgroup events exist on this CPU, then we need * to check if we have to switch out PMU state. * cgroup event are system-wide mode only */ if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) perf_cgroup_sched_out(task, next); } /* * Called with IRQs disabled */ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, enum event_type_t event_type) { ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); } static bool perf_less_group_idx(const void *l, const void *r) { const struct perf_event *le = *(const struct perf_event **)l; const struct perf_event *re = *(const struct perf_event **)r; return le->group_index < re->group_index; } static void swap_ptr(void *l, void *r) { void **lp = l, **rp = r; swap(*lp, *rp); } static const struct min_heap_callbacks perf_min_heap = { .elem_size = sizeof(struct perf_event *), .less = perf_less_group_idx, .swp = swap_ptr, }; static void __heap_add(struct min_heap *heap, struct perf_event *event) { struct perf_event **itrs = heap->data; if (event) { itrs[heap->nr] = event; heap->nr++; } } static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, struct perf_event_groups *groups, int cpu, int (*func)(struct perf_event *, void *), void *data) { #ifdef CONFIG_CGROUP_PERF struct cgroup_subsys_state *css = NULL; #endif /* Space for per CPU and/or any CPU event iterators. */ struct perf_event *itrs[2]; struct min_heap event_heap; struct perf_event **evt; int ret; if (cpuctx) { event_heap = (struct min_heap){ .data = cpuctx->heap, .nr = 0, .size = cpuctx->heap_size, }; lockdep_assert_held(&cpuctx->ctx.lock); #ifdef CONFIG_CGROUP_PERF if (cpuctx->cgrp) css = &cpuctx->cgrp->css; #endif } else { event_heap = (struct min_heap){ .data = itrs, .nr = 0, .size = ARRAY_SIZE(itrs), }; /* Events not within a CPU context may be on any CPU. */ __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL)); } evt = event_heap.data; __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL)); #ifdef CONFIG_CGROUP_PERF for (; css; css = css->parent) __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup)); #endif min_heapify_all(&event_heap, &perf_min_heap); while (event_heap.nr) { ret = func(*evt, data); if (ret) return ret; *evt = perf_event_groups_next(*evt); if (*evt) min_heapify(&event_heap, 0, &perf_min_heap); else min_heap_pop(&event_heap, &perf_min_heap); } return 0; } /* * Because the userpage is strictly per-event (there is no concept of context, * so there cannot be a context indirection), every userpage must be updated * when context time starts :-( * * IOW, we must not miss EVENT_TIME edges. */ static inline bool event_update_userpage(struct perf_event *event) { if (likely(!atomic_read(&event->mmap_count))) return false; perf_event_update_time(event); perf_event_update_userpage(event); return true; } static inline void group_update_userpage(struct perf_event *group_event) { struct perf_event *event; if (!event_update_userpage(group_event)) return; for_each_sibling_event(event, group_event) event_update_userpage(event); } static int merge_sched_in(struct perf_event *event, void *data) { struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); int *can_add_hw = data; if (event->state <= PERF_EVENT_STATE_OFF) return 0; if (!event_filter_match(event)) return 0; if (group_can_go_on(event, cpuctx, *can_add_hw)) { if (!group_sched_in(event, cpuctx, ctx)) list_add_tail(&event->active_list, get_event_list(event)); } if (event->state == PERF_EVENT_STATE_INACTIVE) { *can_add_hw = 0; if (event->attr.pinned) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } else { ctx->rotate_necessary = 1; perf_mux_hrtimer_restart(cpuctx); group_update_userpage(event); } } return 0; } static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx) { int can_add_hw = 1; if (ctx != &cpuctx->ctx) cpuctx = NULL; visit_groups_merge(cpuctx, &ctx->pinned_groups, smp_processor_id(), merge_sched_in, &can_add_hw); } static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx) { int can_add_hw = 1; if (ctx != &cpuctx->ctx) cpuctx = NULL; visit_groups_merge(cpuctx, &ctx->flexible_groups, smp_processor_id(), merge_sched_in, &can_add_hw); } static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type, struct task_struct *task) { int is_active = ctx->is_active; lockdep_assert_held(&ctx->lock); if (likely(!ctx->nr_events)) return; if (!(is_active & EVENT_TIME)) { /* start ctx time */ __update_context_time(ctx, false); perf_cgroup_set_timestamp(task, ctx); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() */ barrier(); } ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { if (!is_active) cpuctx->task_ctx = ctx; else WARN_ON_ONCE(cpuctx->task_ctx != ctx); } is_active ^= ctx->is_active; /* changed bits */ /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ if (is_active & EVENT_PINNED) ctx_pinned_sched_in(ctx, cpuctx); /* Then walk through the lower prio flexible groups */ if (is_active & EVENT_FLEXIBLE) ctx_flexible_sched_in(ctx, cpuctx); } static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, enum event_type_t event_type, struct task_struct *task) { struct perf_event_context *ctx = &cpuctx->ctx; ctx_sched_in(ctx, cpuctx, event_type, task); } static void perf_event_context_sched_in(struct perf_event_context *ctx, struct task_struct *task) { struct perf_cpu_context *cpuctx; struct pmu *pmu = ctx->pmu; cpuctx = __get_cpu_context(ctx); if (cpuctx->task_ctx == ctx) { if (cpuctx->sched_cb_usage) __perf_pmu_sched_task(cpuctx, true); return; } perf_ctx_lock(cpuctx, ctx); /* * We must check ctx->nr_events while holding ctx->lock, such * that we serialize against perf_install_in_context(). */ if (!ctx->nr_events) goto unlock; perf_pmu_disable(pmu); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, * cpu flexible, task flexible. * * However, if task's ctx is not carrying any pinned * events, no need to flip the cpuctx's events around. */ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, ctx, task); if (cpuctx->sched_cb_usage && pmu->sched_task) pmu->sched_task(cpuctx->task_ctx, true); perf_pmu_enable(pmu); unlock: perf_ctx_unlock(cpuctx, ctx); } /* * Called from scheduler to add the events of the current task * with interrupts disabled. * * We restore the event value and then enable it. * * This does not protect us against NMI, but enable() * sets the enabled bit in the control field of event _before_ * accessing the event control register. If a NMI hits, then it will * keep the event running. */ void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { struct perf_event_context *ctx; int ctxn; /* * If cgroup events exist on this CPU, then we need to check if we have * to switch in PMU state; cgroup event are system-wide mode only. * * Since cgroup events are CPU events, we must schedule these in before * we schedule in the task events. */ if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) perf_cgroup_sched_in(prev, task); for_each_task_context_nr(ctxn) { ctx = task->perf_event_ctxp[ctxn]; if (likely(!ctx)) continue; perf_event_context_sched_in(ctx, task); } if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(prev, task, true); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) { u64 frequency = event->attr.sample_freq; u64 sec = NSEC_PER_SEC; u64 divisor, dividend; int count_fls, nsec_fls, frequency_fls, sec_fls; count_fls = fls64(count); nsec_fls = fls64(nsec); frequency_fls = fls64(frequency); sec_fls = 30; /* * We got @count in @nsec, with a target of sample_freq HZ * the target period becomes: * * @count * 10^9 * period = ------------------- * @nsec * sample_freq * */ /* * Reduce accuracy by one bit such that @a and @b converge * to a similar magnitude. */ #define REDUCE_FLS(a, b) \ do { \ if (a##_fls > b##_fls) { \ a >>= 1; \ a##_fls--; \ } else { \ b >>= 1; \ b##_fls--; \ } \ } while (0) /* * Reduce accuracy until either term fits in a u64, then proceed with * the other, so that finally we can do a u64/u64 division. */ while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { REDUCE_FLS(nsec, frequency); REDUCE_FLS(sec, count); } if (count_fls + sec_fls > 64) { divisor = nsec * frequency; while (count_fls + sec_fls > 64) { REDUCE_FLS(count, sec); divisor >>= 1; } dividend = count * sec; } else { dividend = count * sec; while (nsec_fls + frequency_fls > 64) { REDUCE_FLS(nsec, frequency); dividend >>= 1; } divisor = nsec * frequency; } if (!divisor) return dividend; return div64_u64(dividend, divisor); } static DEFINE_PER_CPU(int, perf_throttled_count); static DEFINE_PER_CPU(u64, perf_throttled_seq); static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable) { struct hw_perf_event *hwc = &event->hw; s64 period, sample_period; s64 delta; period = perf_calculate_period(event, nsec, count); delta = (s64)(period - hwc->sample_period); delta = (delta + 7) / 8; /* low pass filter */ sample_period = hwc->sample_period + delta; if (!sample_period) sample_period = 1; hwc->sample_period = sample_period; if (local64_read(&hwc->period_left) > 8*sample_period) { if (disable) event->pmu->stop(event, PERF_EF_UPDATE); local64_set(&hwc->period_left, 0); if (disable) event->pmu->start(event, PERF_EF_RELOAD); } } /* * combine freq adjustment with unthrottling to avoid two passes over the * events. At the same time, make sure, having freq events does not change * the rate of unthrottling as that would introduce bias. */ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, int needs_unthr) { struct perf_event *event; struct hw_perf_event *hwc; u64 now, period = TICK_NSEC; s64 delta; /* * only need to iterate over all events iff: * - context have events in frequency mode (needs freq adjust) * - there are events to unthrottle on this cpu */ if (!(ctx->nr_freq || needs_unthr)) return; raw_spin_lock(&ctx->lock); perf_pmu_disable(ctx->pmu); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; if (!event_filter_match(event)) continue; perf_pmu_disable(event->pmu); hwc = &event->hw; if (hwc->interrupts == MAX_INTERRUPTS) { hwc->interrupts = 0; perf_log_throttle(event, 1); event->pmu->start(event, 0); } if (!event->attr.freq || !event->attr.sample_freq) goto next; /* * stop the event and update event->count */ event->pmu->stop(event, PERF_EF_UPDATE); now = local64_read(&event->count); delta = now - hwc->freq_count_stamp; hwc->freq_count_stamp = now; /* * restart the event * reload only if value has changed * we have stopped the event so tell that * to perf_adjust_period() to avoid stopping it * twice. */ if (delta > 0) perf_adjust_period(event, period, delta, false); event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); next: perf_pmu_enable(event->pmu); } perf_pmu_enable(ctx->pmu); raw_spin_unlock(&ctx->lock); } /* * Move @event to the tail of the @ctx's elegible events. */ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) { /* * Rotate the first entry last of non-pinned groups. Rotation might be * disabled by the inheritance code. */ if (ctx->rotate_disable) return; perf_event_groups_delete(&ctx->flexible_groups, event); perf_event_groups_insert(&ctx->flexible_groups, event); } /* pick an event from the flexible_groups to rotate */ static inline struct perf_event * ctx_event_to_rotate(struct perf_event_context *ctx) { struct perf_event *event; /* pick the first active flexible event */ event = list_first_entry_or_null(&ctx->flexible_active, struct perf_event, active_list); /* if no active flexible event, pick the first event */ if (!event) { event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree), typeof(*event), group_node); } /* * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in() * finds there are unschedulable events, it will set it again. */ ctx->rotate_necessary = 0; return event; } static bool perf_rotate_context(struct perf_cpu_context *cpuctx) { struct perf_event *cpu_event = NULL, *task_event = NULL; struct perf_event_context *task_ctx = NULL; int cpu_rotate, task_rotate; /* * Since we run this from IRQ context, nobody can install new * events, thus the event count values are stable. */ cpu_rotate = cpuctx->ctx.rotate_necessary; task_ctx = cpuctx->task_ctx; task_rotate = task_ctx ? task_ctx->rotate_necessary : 0; if (!(cpu_rotate || task_rotate)) return false; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); if (task_rotate) task_event = ctx_event_to_rotate(task_ctx); if (cpu_rotate) cpu_event = ctx_event_to_rotate(&cpuctx->ctx); /* * As per the order given at ctx_resched() first 'pop' task flexible * and then, if needed CPU flexible. */ if (task_event || (task_ctx && cpu_event)) ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE); if (cpu_event) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); if (task_event) rotate_ctx(task_ctx, task_event); if (cpu_event) rotate_ctx(&cpuctx->ctx, cpu_event); perf_event_sched_in(cpuctx, task_ctx, current); perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); return true; } void perf_event_task_tick(void) { struct list_head *head = this_cpu_ptr(&active_ctx_list); struct perf_event_context *ctx, *tmp; int throttled; lockdep_assert_irqs_disabled(); __this_cpu_inc(perf_throttled_seq); throttled = __this_cpu_xchg(perf_throttled_count, 0); tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); list_for_each_entry_safe(ctx, tmp, head, active_ctx_list) perf_adjust_freq_unthr_context(ctx, throttled); } static int event_enable_on_exec(struct perf_event *event, struct perf_event_context *ctx) { if (!event->attr.enable_on_exec) return 0; event->attr.enable_on_exec = 0; if (event->state >= PERF_EVENT_STATE_INACTIVE) return 0; perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); return 1; } /* * Enable all of a task's events that have been marked enable-on-exec. * This expects task == current. */ static void perf_event_enable_on_exec(int ctxn) { struct perf_event_context *ctx, *clone_ctx = NULL; enum event_type_t event_type = 0; struct perf_cpu_context *cpuctx; struct perf_event *event; unsigned long flags; int enabled = 0; local_irq_save(flags); ctx = current->perf_event_ctxp[ctxn]; if (!ctx || !ctx->nr_events) goto out; cpuctx = __get_cpu_context(ctx); perf_ctx_lock(cpuctx, ctx); ctx_sched_out(ctx, cpuctx, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) { enabled |= event_enable_on_exec(event, ctx); event_type |= get_event_type(event); } /* * Unclone and reschedule this context if we enabled any event. */ if (enabled) { clone_ctx = unclone_ctx(ctx); ctx_resched(cpuctx, ctx, event_type); } else { ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); } perf_ctx_unlock(cpuctx, ctx); out: local_irq_restore(flags); if (clone_ctx) put_ctx(clone_ctx); } struct perf_read_data { struct perf_event *event; bool group; int ret; }; static int __perf_event_read_cpu(struct perf_event *event, int event_cpu) { u16 local_pkg, event_pkg; if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { int local_cpu = smp_processor_id(); event_pkg = topology_physical_package_id(event_cpu); local_pkg = topology_physical_package_id(local_cpu); if (event_pkg == local_pkg) return local_cpu; } return event_cpu; } /* * Cross CPU call to read the hardware event */ static void __perf_event_read(void *info) { struct perf_read_data *data = info; struct perf_event *sub, *event = data->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct pmu *pmu = event->pmu; /* * If this is a task context, we need to check whether it is * the current task context of this cpu. If not it has been * scheduled out before the smp call arrived. In that case * event->count would have been updated to a recent sample * when the event was scheduled out. */ if (ctx->task && cpuctx->task_ctx != ctx) return; raw_spin_lock(&ctx->lock); if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); update_cgrp_time_from_event(event); } perf_event_update_time(event); if (data->group) perf_event_update_sibling_time(event); if (event->state != PERF_EVENT_STATE_ACTIVE) goto unlock; if (!data->group) { pmu->read(event); data->ret = 0; goto unlock; } pmu->start_txn(pmu, PERF_PMU_TXN_READ); pmu->read(event); for_each_sibling_event(sub, event) { if (sub->state == PERF_EVENT_STATE_ACTIVE) { /* * Use sibling's PMU rather than @event's since * sibling could be on different (eg: software) PMU. */ sub->pmu->read(sub); } } data->ret = pmu->commit_txn(pmu); unlock: raw_spin_unlock(&ctx->lock); } static inline u64 perf_event_count(struct perf_event *event) { return local64_read(&event->count) + atomic64_read(&event->child_count); } static void calc_timer_values(struct perf_event *event, u64 *now, u64 *enabled, u64 *running) { u64 ctx_time; *now = perf_clock(); ctx_time = perf_event_time_now(event, *now); __perf_update_times(event, ctx_time, enabled, running); } /* * NMI-safe method to read a local event, that is an event that * is: * - either for the current task, or for this CPU * - does not have inherit set, for inherited task events * will not be local and we cannot read them atomically * - must not have a pmu::count method */ int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running) { unsigned long flags; int ret = 0; /* * Disabling interrupts avoids all counter scheduling (context * switches, timer based rotation and IPIs). */ local_irq_save(flags); /* * It must not be an event with inherit set, we cannot read * all child counters from atomic context. */ if (event->attr.inherit) { ret = -EOPNOTSUPP; goto out; } /* If this is a per-task event, it must be for current */ if ((event->attach_state & PERF_ATTACH_TASK) && event->hw.target != current) { ret = -EINVAL; goto out; } /* If this is a per-CPU event, it must be for this CPU */ if (!(event->attach_state & PERF_ATTACH_TASK) && event->cpu != smp_processor_id()) { ret = -EINVAL; goto out; } /* If this is a pinned event it must be running on this CPU */ if (event->attr.pinned && event->oncpu != smp_processor_id()) { ret = -EBUSY; goto out; } /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1). */ if (event->oncpu == smp_processor_id()) event->pmu->read(event); *value = local64_read(&event->count); if (enabled || running) { u64 __enabled, __running, __now;; calc_timer_values(event, &__now, &__enabled, &__running); if (enabled) *enabled = __enabled; if (running) *running = __running; } out: local_irq_restore(flags); return ret; } static int perf_event_read(struct perf_event *event, bool group) { enum perf_event_state state = READ_ONCE(event->state); int event_cpu, ret = 0; /* * If event is enabled and currently active on a CPU, update the * value in the event structure: */ again: if (state == PERF_EVENT_STATE_ACTIVE) { struct perf_read_data data; /* * Orders the ->state and ->oncpu loads such that if we see * ACTIVE we must also see the right ->oncpu. * * Matches the smp_wmb() from event_sched_in(). */ smp_rmb(); event_cpu = READ_ONCE(event->oncpu); if ((unsigned)event_cpu >= nr_cpu_ids) return 0; data = (struct perf_read_data){ .event = event, .group = group, .ret = 0, }; preempt_disable(); event_cpu = __perf_event_read_cpu(event, event_cpu); /* * Purposely ignore the smp_call_function_single() return * value. * * If event_cpu isn't a valid CPU it means the event got * scheduled out and that will have updated the event count. * * Therefore, either way, we'll have an up-to-date event count * after this. */ (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1); preempt_enable(); ret = data.ret; } else if (state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); state = event->state; if (state != PERF_EVENT_STATE_INACTIVE) { raw_spin_unlock_irqrestore(&ctx->lock, flags); goto again; } /* * May read while context is not active (e.g., thread is * blocked), in that case we cannot update context time */ if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); update_cgrp_time_from_event(event); } perf_event_update_time(event); if (group) perf_event_update_sibling_time(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } return ret; } /* * Initialize the perf_event context in a task_struct: */ static void __perf_event_init_context(struct perf_event_context *ctx) { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->active_ctx_list); perf_event_groups_init(&ctx->pinned_groups); perf_event_groups_init(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); INIT_LIST_HEAD(&ctx->pinned_active); INIT_LIST_HEAD(&ctx->flexible_active); refcount_set(&ctx->refcount, 1); } static struct perf_event_context * alloc_perf_context(struct pmu *pmu, struct task_struct *task) { struct perf_event_context *ctx; ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); if (!ctx) return NULL; __perf_event_init_context(ctx); if (task) ctx->task = get_task_struct(task); ctx->pmu = pmu; return ctx; } static struct task_struct * find_lively_task_by_vpid(pid_t vpid) { struct task_struct *task; rcu_read_lock(); if (!vpid) task = current; else task = find_task_by_vpid(vpid); if (task) get_task_struct(task); rcu_read_unlock(); if (!task) return ERR_PTR(-ESRCH); return task; } /* * Returns a matching context with refcount and pincount. */ static struct perf_event_context * find_get_context(struct pmu *pmu, struct task_struct *task, struct perf_event *event) { struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_cpu_context *cpuctx; void *task_ctx_data = NULL; unsigned long flags; int ctxn, err; int cpu = event->cpu; if (!task) { /* Must be root to operate on a CPU event: */ err = perf_allow_cpu(&event->attr); if (err) return ERR_PTR(err); cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); ctx = &cpuctx->ctx; get_ctx(ctx); raw_spin_lock_irqsave(&ctx->lock, flags); ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); return ctx; } err = -EINVAL; ctxn = pmu->task_ctx_nr; if (ctxn < 0) goto errout; if (event->attach_state & PERF_ATTACH_TASK_DATA) { task_ctx_data = alloc_task_ctx_data(pmu); if (!task_ctx_data) { err = -ENOMEM; goto errout; } } retry: ctx = perf_lock_task_context(task, ctxn, &flags); if (ctx) { clone_ctx = unclone_ctx(ctx); ++ctx->pin_count; if (task_ctx_data && !ctx->task_ctx_data) { ctx->task_ctx_data = task_ctx_data; task_ctx_data = NULL; } raw_spin_unlock_irqrestore(&ctx->lock, flags); if (clone_ctx) put_ctx(clone_ctx); } else { ctx = alloc_perf_context(pmu, task); err = -ENOMEM; if (!ctx) goto errout; if (task_ctx_data) { ctx->task_ctx_data = task_ctx_data; task_ctx_data = NULL; } err = 0; mutex_lock(&task->perf_event_mutex); /* * If it has already passed perf_event_exit_task(). * we must see PF_EXITING, it takes this mutex too. */ if (task->flags & PF_EXITING) err = -ESRCH; else if (task->perf_event_ctxp[ctxn]) err = -EAGAIN; else { get_ctx(ctx); ++ctx->pin_count; rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); } mutex_unlock(&task->perf_event_mutex); if (unlikely(err)) { put_ctx(ctx); if (err == -EAGAIN) goto retry; goto errout; } } free_task_ctx_data(pmu, task_ctx_data); return ctx; errout: free_task_ctx_data(pmu, task_ctx_data); return ERR_PTR(err); } static void perf_event_free_filter(struct perf_event *event); static void perf_event_free_bpf_prog(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) { struct perf_event *event; event = container_of(head, struct perf_event, rcu_head); if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); kfree(event); } static void ring_buffer_attach(struct perf_event *event, struct perf_buffer *rb); static void detach_sb_event(struct perf_event *event) { struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); raw_spin_lock(&pel->lock); list_del_rcu(&event->sb_list); raw_spin_unlock(&pel->lock); } static bool is_sb_event(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; if (event->parent) return false; if (event->attach_state & PERF_ATTACH_TASK) return false; if (attr->mmap || attr->mmap_data || attr->mmap2 || attr->comm || attr->comm_exec || attr->task || attr->ksymbol || attr->context_switch || attr->text_poke || attr->bpf_event) return true; return false; } static void unaccount_pmu_sb_event(struct perf_event *event) { if (is_sb_event(event)) detach_sb_event(event); } static void unaccount_event_cpu(struct perf_event *event, int cpu) { if (event->parent) return; if (is_cgroup_event(event)) atomic_dec(&per_cpu(perf_cgroup_events, cpu)); } #ifdef CONFIG_NO_HZ_FULL static DEFINE_SPINLOCK(nr_freq_lock); #endif static void unaccount_freq_event_nohz(void) { #ifdef CONFIG_NO_HZ_FULL spin_lock(&nr_freq_lock); if (atomic_dec_and_test(&nr_freq_events)) tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS); spin_unlock(&nr_freq_lock); #endif } static void unaccount_freq_event(void) { if (tick_nohz_full_enabled()) unaccount_freq_event_nohz(); else atomic_dec(&nr_freq_events); } static void unaccount_event(struct perf_event *event) { bool dec = false; if (event->parent) return; if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) dec = true; if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) atomic_dec(&nr_comm_events); if (event->attr.namespaces) atomic_dec(&nr_namespaces_events); if (event->attr.cgroup) atomic_dec(&nr_cgroup_events); if (event->attr.task) atomic_dec(&nr_task_events); if (event->attr.freq) unaccount_freq_event(); if (event->attr.context_switch) { dec = true; atomic_dec(&nr_switch_events); } if (is_cgroup_event(event)) dec = true; if (has_branch_stack(event)) dec = true; if (event->attr.ksymbol) atomic_dec(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_dec(&nr_bpf_events); if (event->attr.text_poke) atomic_dec(&nr_text_poke_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) schedule_delayed_work(&perf_sched_work, HZ); } unaccount_event_cpu(event, event->cpu); unaccount_pmu_sb_event(event); } static void perf_sched_delayed(struct work_struct *work) { mutex_lock(&perf_sched_mutex); if (atomic_dec_and_test(&perf_sched_count)) static_branch_disable(&perf_sched_events); mutex_unlock(&perf_sched_mutex); } /* * The following implement mutual exclusion of events on "exclusive" pmus * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled * at a time, so we disallow creating events that might conflict, namely: * * 1) cpu-wide events in the presence of per-task events, * 2) per-task events in the presence of cpu-wide events, * 3) two matching events on the same context. * * The former two cases are handled in the allocation path (perf_event_alloc(), * _free_event()), the latter -- before the first perf_install_in_context(). */ static int exclusive_event_init(struct perf_event *event) { struct pmu *pmu = event->pmu; if (!is_exclusive_pmu(pmu)) return 0; /* * Prevent co-existence of per-task and cpu-wide events on the * same exclusive pmu. * * Negative pmu::exclusive_cnt means there are cpu-wide * events on this "exclusive" pmu, positive means there are * per-task events. * * Since this is called in perf_event_alloc() path, event::ctx * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK * to mean "per-task event", because unlike other attach states it * never gets cleared. */ if (event->attach_state & PERF_ATTACH_TASK) { if (!atomic_inc_unless_negative(&pmu->exclusive_cnt)) return -EBUSY; } else { if (!atomic_dec_unless_positive(&pmu->exclusive_cnt)) return -EBUSY; } return 0; } static void exclusive_event_destroy(struct perf_event *event) { struct pmu *pmu = event->pmu; if (!is_exclusive_pmu(pmu)) return; /* see comment in exclusive_event_init() */ if (event->attach_state & PERF_ATTACH_TASK) atomic_dec(&pmu->exclusive_cnt); else atomic_inc(&pmu->exclusive_cnt); } static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) { if ((e1->pmu == e2->pmu) && (e1->cpu == e2->cpu || e1->cpu == -1 || e2->cpu == -1)) return true; return false; } static bool exclusive_event_installable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *iter_event; struct pmu *pmu = event->pmu; lockdep_assert_held(&ctx->mutex); if (!is_exclusive_pmu(pmu)) return true; list_for_each_entry(iter_event, &ctx->event_list, event_entry) { if (exclusive_event_match(iter_event, event)) return false; } return true; } static void perf_addr_filters_splice(struct perf_event *event, struct list_head *head); static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending); unaccount_event(event); security_perf_event_free(event); if (event->rb) { /* * Can happen when we close an event with re-directed output. * * Since we have a 0 refcount, perf_mmap_close() will skip * over us; possibly making our ring_buffer_put() the last. */ mutex_lock(&event->mmap_mutex); ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); } if (is_cgroup_event(event)) perf_detach_cgroup(event); if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) put_callchain_buffers(); } perf_event_free_bpf_prog(event); perf_addr_filters_splice(event, NULL); kfree(event->addr_filter_ranges); if (event->destroy) event->destroy(event); /* * Must be after ->destroy(), due to uprobe_perf_close() using * hw.target. */ if (event->hw.target) put_task_struct(event->hw.target); /* * perf_event_free_task() relies on put_ctx() being 'last', in particular * all task references must be cleaned up. */ if (event->ctx) put_ctx(event->ctx); exclusive_event_destroy(event); module_put(event->pmu->module); call_rcu(&event->rcu_head, free_event_rcu); } /* * Used to free events which have a known refcount of 1, such as in error paths * where the event isn't exposed yet and inherited events. */ static void free_event(struct perf_event *event) { if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, "unexpected event refcount: %ld; ptr=%p\n", atomic_long_read(&event->refcount), event)) { /* leak to avoid use-after-free */ return; } _free_event(event); } /* * Remove user event from the owner task. */ static void perf_remove_from_owner(struct perf_event *event) { struct task_struct *owner; rcu_read_lock(); /* * Matches the smp_store_release() in perf_event_exit_task(). If we * observe !owner it means the list deletion is complete and we can * indeed free this event, otherwise we need to serialize on * owner->perf_event_mutex. */ owner = READ_ONCE(event->owner); if (owner) { /* * Since delayed_put_task_struct() also drops the last * task reference we can safely take a new reference * while holding the rcu_read_lock(). */ get_task_struct(owner); } rcu_read_unlock(); if (owner) { /* * If we're here through perf_event_exit_task() we're already * holding ctx->mutex which would be an inversion wrt. the * normal lock order. * * However we can safely take this lock because its the child * ctx->mutex. */ mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING); /* * We have to re-check the event->owner field, if it is cleared * we raced with perf_event_exit_task(), acquiring the mutex * ensured they're done, and we can proceed with freeing the * event. */ if (event->owner) { list_del_init(&event->owner_entry); smp_store_release(&event->owner, NULL); } mutex_unlock(&owner->perf_event_mutex); put_task_struct(owner); } } static void put_event(struct perf_event *event) { if (!atomic_long_dec_and_test(&event->refcount)) return; _free_event(event); } /* * Kill an event dead; while event:refcount will preserve the event * object, it will not preserve its functionality. Once the last 'user' * gives up the object, we'll destroy the thing. */ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_event *child, *tmp; LIST_HEAD(free_list); /* * If we got here through err_file: fput(event_file); we will not have * attached to a context yet. */ if (!ctx) { WARN_ON_ONCE(event->attach_state & (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP)); goto no_ctx; } if (!is_kernel_event(event)) perf_remove_from_owner(event); ctx = perf_event_ctx_lock(event); WARN_ON_ONCE(ctx->parent_ctx); perf_remove_from_context(event, DETACH_GROUP); raw_spin_lock_irq(&ctx->lock); /* * Mark this event as STATE_DEAD, there is no external reference to it * anymore. * * Anybody acquiring event->child_mutex after the below loop _must_ * also see this, most importantly inherit_event() which will avoid * placing more children on the list. * * Thus this guarantees that we will in fact observe and kill _ALL_ * child events. */ event->state = PERF_EVENT_STATE_DEAD; raw_spin_unlock_irq(&ctx->lock); perf_event_ctx_unlock(event, ctx); again: mutex_lock(&event->child_mutex); list_for_each_entry(child, &event->child_list, child_list) { /* * Cannot change, child events are not migrated, see the * comment with perf_event_ctx_lock_nested(). */ ctx = READ_ONCE(child->ctx); /* * Since child_mutex nests inside ctx::mutex, we must jump * through hoops. We start by grabbing a reference on the ctx. * * Since the event cannot get freed while we hold the * child_mutex, the context must also exist and have a !0 * reference count. */ get_ctx(ctx); /* * Now that we have a ctx ref, we can drop child_mutex, and * acquire ctx::mutex without fear of it going away. Then we * can re-acquire child_mutex. */ mutex_unlock(&event->child_mutex); mutex_lock(&ctx->mutex); mutex_lock(&event->child_mutex); /* * Now that we hold ctx::mutex and child_mutex, revalidate our * state, if child is still the first entry, it didn't get freed * and we can continue doing so. */ tmp = list_first_entry_or_null(&event->child_list, struct perf_event, child_list); if (tmp == child) { perf_remove_from_context(child, DETACH_GROUP); list_move(&child->child_list, &free_list); /* * This matches the refcount bump in inherit_event(); * this can't be the last reference. */ put_event(event); } mutex_unlock(&event->child_mutex); mutex_unlock(&ctx->mutex); put_ctx(ctx); goto again; } mutex_unlock(&event->child_mutex); list_for_each_entry_safe(child, tmp, &free_list, child_list) { void *var = &child->ctx->refcount; list_del(&child->child_list); free_event(child); /* * Wake any perf_event_free_task() waiting for this event to be * freed. */ smp_mb(); /* pairs with wait_var_event() */ wake_up_var(var); } no_ctx: put_event(event); /* Must be the 'last' reference */ return 0; } EXPORT_SYMBOL_GPL(perf_event_release_kernel); /* * Called when the last reference to the file is gone. */ static int perf_release(struct inode *inode, struct file *file) { perf_event_release_kernel(file->private_data); return 0; } static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; u64 total = 0; *enabled = 0; *running = 0; mutex_lock(&event->child_mutex); (void)perf_event_read(event, false); total += perf_event_count(event); *enabled += event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); *running += event->total_time_running + atomic64_read(&event->child_total_time_running); list_for_each_entry(child, &event->child_list, child_list) { (void)perf_event_read(child, false); total += perf_event_count(child); *enabled += child->total_time_enabled; *running += child->total_time_running; } mutex_unlock(&event->child_mutex); return total; } u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event_context *ctx; u64 count; ctx = perf_event_ctx_lock(event); count = __perf_event_read_value(event, enabled, running); perf_event_ctx_unlock(event, ctx); return count; } EXPORT_SYMBOL_GPL(perf_event_read_value); static int __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) { struct perf_event_context *ctx = leader->ctx; struct perf_event *sub, *parent; unsigned long flags; int n = 1; /* skip @nr */ int ret; ret = perf_event_read(leader, true); if (ret) return ret; raw_spin_lock_irqsave(&ctx->lock, flags); /* * Verify the grouping between the parent and child (inherited) * events is still in tact. * * Specifically: * - leader->ctx->lock pins leader->sibling_list * - parent->child_mutex pins parent->child_list * - parent->ctx->mutex pins parent->sibling_list * * Because parent->ctx != leader->ctx (and child_list nests inside * ctx->mutex), group destruction is not atomic between children, also * see perf_event_release_kernel(). Additionally, parent can grow the * group. * * Therefore it is possible to have parent and child groups in a * different configuration and summing over such a beast makes no sense * what so ever. * * Reject this. */ parent = leader->parent; if (parent && (parent->group_generation != leader->group_generation || parent->nr_siblings != leader->nr_siblings)) { ret = -ECHILD; goto unlock; } /* * Since we co-schedule groups, {enabled,running} times of siblings * will be identical to those of the leader, so we only publish one * set. */ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { values[n++] += leader->total_time_enabled + atomic64_read(&leader->child_total_time_enabled); } if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { values[n++] += leader->total_time_running + atomic64_read(&leader->child_total_time_running); } /* * Write {count,id} tuples for every sibling. */ values[n++] += perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&leader->lost_samples); for_each_sibling_event(sub, leader) { values[n++] += perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&sub->lost_samples); } unlock: raw_spin_unlock_irqrestore(&ctx->lock, flags); return ret; } static int perf_read_group(struct perf_event *event, u64 read_format, char __user *buf) { struct perf_event *leader = event->group_leader, *child; struct perf_event_context *ctx = leader->ctx; int ret; u64 *values; lockdep_assert_held(&ctx->mutex); values = kzalloc(event->read_size, GFP_KERNEL); if (!values) return -ENOMEM; values[0] = 1 + leader->nr_siblings; mutex_lock(&leader->child_mutex); ret = __perf_read_group_add(leader, read_format, values); if (ret) goto unlock; list_for_each_entry(child, &leader->child_list, child_list) { ret = __perf_read_group_add(child, read_format, values); if (ret) goto unlock; } mutex_unlock(&leader->child_mutex); ret = event->read_size; if (copy_to_user(buf, values, event->read_size)) ret = -EFAULT; goto out; unlock: mutex_unlock(&leader->child_mutex); out: kfree(values); return ret; } static int perf_read_one(struct perf_event *event, u64 read_format, char __user *buf) { u64 enabled, running; u64 values[5]; int n = 0; values[n++] = __perf_event_read_value(event, &enabled, &running); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = enabled; if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&event->lost_samples); if (copy_to_user(buf, values, n * sizeof(u64))) return -EFAULT; return n * sizeof(u64); } static bool is_event_hup(struct perf_event *event) { bool no_children; if (event->state > PERF_EVENT_STATE_EXIT) return false; mutex_lock(&event->child_mutex); no_children = list_empty(&event->child_list); mutex_unlock(&event->child_mutex); return no_children; } /* * Read the performance event - simple non blocking version for now */ static ssize_t __perf_read(struct perf_event *event, char __user *buf, size_t count) { u64 read_format = event->attr.read_format; int ret; /* * Return end-of-file for a read on an event that is in * error state (i.e. because it was pinned but it couldn't be * scheduled on to the CPU at some point). */ if (event->state == PERF_EVENT_STATE_ERROR) return 0; if (count < event->read_size) return -ENOSPC; WARN_ON_ONCE(event->ctx->parent_ctx); if (read_format & PERF_FORMAT_GROUP) ret = perf_read_group(event, read_format, buf); else ret = perf_read_one(event, read_format, buf); return ret; } static ssize_t perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct perf_event *event = file->private_data; struct perf_event_context *ctx; int ret; ret = security_perf_event_read(event); if (ret) return ret; ctx = perf_event_ctx_lock(event); ret = __perf_read(event, buf, count); perf_event_ctx_unlock(event, ctx); return ret; } static __poll_t perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; struct perf_buffer *rb; __poll_t events = EPOLLHUP; poll_wait(file, &event->waitq, wait); if (is_event_hup(event)) return events; /* * Pin the event->rb by taking event->mmap_mutex; otherwise * perf_event_set_output() can swizzle our rb and make us miss wakeups. */ mutex_lock(&event->mmap_mutex); rb = event->rb; if (rb) events = atomic_xchg(&rb->poll, 0); mutex_unlock(&event->mmap_mutex); return events; } static void _perf_event_reset(struct perf_event *event) { (void)perf_event_read(event, false); local64_set(&event->count, 0); perf_event_update_userpage(event); } /* Assume it's not an event with inherit set. */ u64 perf_event_pause(struct perf_event *event, bool reset) { struct perf_event_context *ctx; u64 count; ctx = perf_event_ctx_lock(event); WARN_ON_ONCE(event->attr.inherit); _perf_event_disable(event); count = local64_read(&event->count); if (reset) local64_set(&event->count, 0); perf_event_ctx_unlock(event, ctx); return count; } EXPORT_SYMBOL_GPL(perf_event_pause); /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block * in perf_event_exit_event() if it goes to exit, thus satisfying the * task existence requirements of perf_event_enable/disable. */ static void perf_event_for_each_child(struct perf_event *event, void (*func)(struct perf_event *)) { struct perf_event *child; WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->child_mutex); func(event); list_for_each_entry(child, &event->child_list, child_list) func(child); mutex_unlock(&event->child_mutex); } static void perf_event_for_each(struct perf_event *event, void (*func)(struct perf_event *)) { struct perf_event_context *ctx = event->ctx; struct perf_event *sibling; lockdep_assert_held(&ctx->mutex); event = event->group_leader; perf_event_for_each_child(event, func); for_each_sibling_event(sibling, event) perf_event_for_each_child(sibling, func); } static void __perf_event_period(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { u64 value = *((u64 *)info); bool active; if (event->attr.freq) { event->attr.sample_freq = value; } else { event->attr.sample_period = value; event->hw.sample_period = value; } active = (event->state == PERF_EVENT_STATE_ACTIVE); if (active) { perf_pmu_disable(ctx->pmu); /* * We could be throttled; unthrottle now to avoid the tick * trying to unthrottle while we already re-started the event. */ if (event->hw.interrupts == MAX_INTERRUPTS) { event->hw.interrupts = 0; perf_log_throttle(event, 1); } event->pmu->stop(event, PERF_EF_UPDATE); } local64_set(&event->hw.period_left, 0); if (active) { event->pmu->start(event, PERF_EF_RELOAD); perf_pmu_enable(ctx->pmu); } } static int perf_event_check_period(struct perf_event *event, u64 value) { return event->pmu->check_period(event, value); } static int _perf_event_period(struct perf_event *event, u64 value) { if (!is_sampling_event(event)) return -EINVAL; if (!value) return -EINVAL; if (event->attr.freq && value > sysctl_perf_event_sample_rate) return -EINVAL; if (perf_event_check_period(event, value)) return -EINVAL; if (!event->attr.freq && (value & (1ULL << 63))) return -EINVAL; event_function_call(event, __perf_event_period, &value); return 0; } int perf_event_period(struct perf_event *event, u64 value) { struct perf_event_context *ctx; int ret; ctx = perf_event_ctx_lock(event); ret = _perf_event_period(event, value); perf_event_ctx_unlock(event, ctx); return ret; } EXPORT_SYMBOL_GPL(perf_event_period); static const struct file_operations perf_fops; static inline int perf_fget_light(int fd, struct fd *p) { struct fd f = fdget(fd); if (!f.file) return -EBADF; if (f.file->f_op != &perf_fops) { fdput(f); return -EBADF; } *p = f; return 0; } static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { void (*func)(struct perf_event *); u32 flags = arg; switch (cmd) { case PERF_EVENT_IOC_ENABLE: func = _perf_event_enable; break; case PERF_EVENT_IOC_DISABLE: func = _perf_event_disable; break; case PERF_EVENT_IOC_RESET: func = _perf_event_reset; break; case PERF_EVENT_IOC_REFRESH: return _perf_event_refresh(event, arg); case PERF_EVENT_IOC_PERIOD: { u64 value; if (copy_from_user(&value, (u64 __user *)arg, sizeof(value))) return -EFAULT; return _perf_event_period(event, value); } case PERF_EVENT_IOC_ID: { u64 id = primary_event_id(event); if (copy_to_user((void __user *)arg, &id, sizeof(id))) return -EFAULT; return 0; } case PERF_EVENT_IOC_SET_OUTPUT: { int ret; if (arg != -1) { struct perf_event *output_event; struct fd output; ret = perf_fget_light(arg, &output); if (ret) return ret; output_event = output.file->private_data; ret = perf_event_set_output(event, output_event); fdput(output); } else { ret = perf_event_set_output(event, NULL); } return ret; } case PERF_EVENT_IOC_SET_FILTER: return perf_event_set_filter(event, (void __user *)arg); case PERF_EVENT_IOC_SET_BPF: return perf_event_set_bpf_prog(event, arg); case PERF_EVENT_IOC_PAUSE_OUTPUT: { struct perf_buffer *rb; rcu_read_lock(); rb = rcu_dereference(event->rb); if (!rb || !rb->nr_pages) { rcu_read_unlock(); return -EINVAL; } rb_toggle_paused(rb, !!arg); rcu_read_unlock(); return 0; } case PERF_EVENT_IOC_QUERY_BPF: return perf_event_query_prog_array(event, (void __user *)arg); case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: { struct perf_event_attr new_attr; int err = perf_copy_attr((struct perf_event_attr __user *)arg, &new_attr); if (err) return err; return perf_event_modify_attr(event, &new_attr); } default: return -ENOTTY; } if (flags & PERF_IOC_FLAG_GROUP) perf_event_for_each(event, func); else perf_event_for_each_child(event, func); return 0; } static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct perf_event *event = file->private_data; struct perf_event_context *ctx; long ret; /* Treat ioctl like writes as it is likely a mutating operation. */ ret = security_perf_event_write(event); if (ret) return ret; ctx = perf_event_ctx_lock(event); ret = _perf_ioctl(event, cmd, arg); perf_event_ctx_unlock(event, ctx); return ret; } #ifdef CONFIG_COMPAT static long perf_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { switch (_IOC_NR(cmd)) { case _IOC_NR(PERF_EVENT_IOC_SET_FILTER): case _IOC_NR(PERF_EVENT_IOC_ID): case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF): case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES): /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */ if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) { cmd &= ~IOCSIZE_MASK; cmd |= sizeof(void *) << IOCSIZE_SHIFT; } break; } return perf_ioctl(file, cmd, arg); } #else # define perf_compat_ioctl NULL #endif int perf_event_task_enable(void) { struct perf_event_context *ctx; struct perf_event *event; mutex_lock(&current->perf_event_mutex); list_for_each_entry(event, &current->perf_event_list, owner_entry) { ctx = perf_event_ctx_lock(event); perf_event_for_each_child(event, _perf_event_enable); perf_event_ctx_unlock(event, ctx); } mutex_unlock(&current->perf_event_mutex); return 0; } int perf_event_task_disable(void) { struct perf_event_context *ctx; struct perf_event *event; mutex_lock(&current->perf_event_mutex); list_for_each_entry(event, &current->perf_event_list, owner_entry) { ctx = perf_event_ctx_lock(event); perf_event_for_each_child(event, _perf_event_disable); perf_event_ctx_unlock(event, ctx); } mutex_unlock(&current->perf_event_mutex); return 0; } static int perf_event_index(struct perf_event *event) { if (event->hw.state & PERF_HES_STOPPED) return 0; if (event->state != PERF_EVENT_STATE_ACTIVE) return 0; return event->pmu->event_idx(event); } static void perf_event_init_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; struct perf_buffer *rb; rcu_read_lock(); rb = rcu_dereference(event->rb); if (!rb) goto unlock; userpg = rb->user_page; /* Allow new userspace to detect that bit 0 is deprecated */ userpg->cap_bit0_is_deprecated = 1; userpg->size = offsetof(struct perf_event_mmap_page, __reserved); userpg->data_offset = PAGE_SIZE; userpg->data_size = perf_data_size(rb); unlock: rcu_read_unlock(); } void __weak arch_perf_update_userpage( struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) { } /* * Callers need to ensure there can be no nesting of this function, otherwise * the seqlock logic goes bad. We can not serialize this because the arch * code calls this from NMI context. */ void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; struct perf_buffer *rb; u64 enabled, running, now; rcu_read_lock(); rb = rcu_dereference(event->rb); if (!rb) goto unlock; /* * compute total_time_enabled, total_time_running * based on snapshot values taken when the event * was last scheduled in. * * we cannot simply called update_context_time() * because of locking issue as we can be called in * NMI context */ calc_timer_values(event, &now, &enabled, &running); userpg = rb->user_page; /* * Disable preemption to guarantee consistent time stamps are stored to * the user page. */ preempt_disable(); ++userpg->lock; barrier(); userpg->index = perf_event_index(event); userpg->offset = perf_event_count(event); if (userpg->index) userpg->offset -= local64_read(&event->hw.prev_count); userpg->time_enabled = enabled + atomic64_read(&event->child_total_time_enabled); userpg->time_running = running + atomic64_read(&event->child_total_time_running); arch_perf_update_userpage(event, userpg, now); barrier(); ++userpg->lock; preempt_enable(); unlock: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(perf_event_update_userpage); static vm_fault_t perf_mmap_fault(struct vm_fault *vmf) { struct perf_event *event = vmf->vma->vm_file->private_data; struct perf_buffer *rb; vm_fault_t ret = VM_FAULT_SIGBUS; if (vmf->flags & FAULT_FLAG_MKWRITE) { if (vmf->pgoff == 0) ret = 0; return ret; } rcu_read_lock(); rb = rcu_dereference(event->rb); if (!rb) goto unlock; if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) goto unlock; vmf->page = perf_mmap_to_page(rb, vmf->pgoff); if (!vmf->page) goto unlock; get_page(vmf->page); vmf->page->mapping = vmf->vma->vm_file->f_mapping; vmf->page->index = vmf->pgoff; ret = 0; unlock: rcu_read_unlock(); return ret; } static void ring_buffer_attach(struct perf_event *event, struct perf_buffer *rb) { struct perf_buffer *old_rb = NULL; unsigned long flags; WARN_ON_ONCE(event->parent); if (event->rb) { /* * Should be impossible, we set this when removing * event->rb_entry and wait/clear when adding event->rb_entry. */ WARN_ON_ONCE(event->rcu_pending); old_rb = event->rb; spin_lock_irqsave(&old_rb->event_lock, flags); list_del_rcu(&event->rb_entry); spin_unlock_irqrestore(&old_rb->event_lock, flags); event->rcu_batches = get_state_synchronize_rcu(); event->rcu_pending = 1; } if (rb) { if (event->rcu_pending) { cond_synchronize_rcu(event->rcu_batches); event->rcu_pending = 0; } spin_lock_irqsave(&rb->event_lock, flags); list_add_rcu(&event->rb_entry, &rb->event_list); spin_unlock_irqrestore(&rb->event_lock, flags); } /* * Avoid racing with perf_mmap_close(AUX): stop the event * before swizzling the event::rb pointer; if it's getting * unmapped, its aux_mmap_count will be 0 and it won't * restart. See the comment in __perf_pmu_output_stop(). * * Data will inevitably be lost when set_output is done in * mid-air, but then again, whoever does it like this is * not in for the data anyway. */ if (has_aux(event)) perf_event_stop(event, 0); rcu_assign_pointer(event->rb, rb); if (old_rb) { ring_buffer_put(old_rb); /* * Since we detached before setting the new rb, so that we * could attach the new rb, we could have missed a wakeup. * Provide it now. */ wake_up_all(&event->waitq); } } static void ring_buffer_wakeup(struct perf_event *event) { struct perf_buffer *rb; if (event->parent) event = event->parent; rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { list_for_each_entry_rcu(event, &rb->event_list, rb_entry) wake_up_all(&event->waitq); } rcu_read_unlock(); } struct perf_buffer *ring_buffer_get(struct perf_event *event) { struct perf_buffer *rb; if (event->parent) event = event->parent; rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { if (!refcount_inc_not_zero(&rb->refcount)) rb = NULL; } rcu_read_unlock(); return rb; } void ring_buffer_put(struct perf_buffer *rb) { if (!refcount_dec_and_test(&rb->refcount)) return; WARN_ON_ONCE(!list_empty(&rb->event_list)); call_rcu(&rb->rcu_head, rb_free_rcu); } static void perf_mmap_open(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; atomic_inc(&event->mmap_count); atomic_inc(&event->rb->mmap_count); if (vma->vm_pgoff) atomic_inc(&event->rb->aux_mmap_count); if (event->pmu->event_mapped) event->pmu->event_mapped(event, vma->vm_mm); } static void perf_pmu_output_stop(struct perf_event *event); /* * A buffer can be mmap()ed multiple times; either directly through the same * event, or through other events by use of perf_event_set_output(). * * In order to undo the VM accounting done by perf_mmap() we need to destroy * the buffer here, where we still have a VM context. This means we need * to detach all events redirecting to us. */ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); bool detach_rest = false; if (event->pmu->event_unmapped) event->pmu->event_unmapped(event, vma->vm_mm); /* * rb->aux_mmap_count will always drop before rb->mmap_count and * event->mmap_count, so it is ok to use event->mmap_mutex to * serialize with perf_mmap here. */ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { /* * Stop all AUX events that are writing to this buffer, * so that we can free its AUX pages and corresponding PMU * data. Note that after rb::aux_mmap_count dropped to zero, * they won't start any more (see perf_aux_output_begin()). */ perf_pmu_output_stop(event); /* now it's safe to free the pages */ atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm); atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); /* this has to be the last one */ rb_free_aux(rb); WARN_ON_ONCE(refcount_read(&rb->aux_refcount)); mutex_unlock(&event->mmap_mutex); } if (atomic_dec_and_test(&rb->mmap_count)) detach_rest = true; if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) goto out_put; ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); /* If there's still other mmap()s of this buffer, we're done. */ if (!detach_rest) goto out_put; /* * No other mmap()s, detach from all other events that might redirect * into the now unreachable buffer. Somewhat complicated by the * fact that rb::event_lock otherwise nests inside mmap_mutex. */ again: rcu_read_lock(); list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { if (!atomic_long_inc_not_zero(&event->refcount)) { /* * This event is en-route to free_event() which will * detach it and remove it from the list. */ continue; } rcu_read_unlock(); mutex_lock(&event->mmap_mutex); /* * Check we didn't race with perf_event_set_output() which can * swizzle the rb from under us while we were waiting to * acquire mmap_mutex. * * If we find a different rb; ignore this event, a next * iteration will no longer find it on the list. We have to * still restart the iteration to make sure we're not now * iterating the wrong list. */ if (event->rb == rb) ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); put_event(event); /* * Restart the iteration; either we're on the wrong list or * destroyed its integrity by doing a deletion. */ goto again; } rcu_read_unlock(); /* * It could be there's still a few 0-ref events on the list; they'll * get cleaned up by free_event() -- they'll also still have their * ref on the rb and will free it whenever they are done with it. * * Aside from that, this buffer is 'fully' detached and unmapped, * undo the VM accounting. */ atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked, &mmap_user->locked_vm); atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); free_uid(mmap_user); out_put: ring_buffer_put(rb); /* could be last */ } static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, .close = perf_mmap_close, /* non mergeable */ .fault = perf_mmap_fault, .page_mkwrite = perf_mmap_fault, }; static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_event *event = file->private_data; unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); struct perf_buffer *rb = NULL; unsigned long locked, lock_limit; unsigned long vma_size; unsigned long nr_pages; long user_extra = 0, extra = 0; int ret = 0, flags = 0; /* * Don't allow mmap() of inherited per-task counters. This would * create a performance issue due to all children writing to the * same rb. */ if (event->cpu == -1 && event->attr.inherit) return -EINVAL; if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; ret = security_perf_event_read(event); if (ret) return ret; vma_size = vma->vm_end - vma->vm_start; if (vma->vm_pgoff == 0) { nr_pages = (vma_size / PAGE_SIZE) - 1; } else { /* * AUX area mapping: if rb->aux_nr_pages != 0, it's already * mapped, all subsequent mappings should have the same size * and offset. Must be above the normal perf buffer. */ u64 aux_offset, aux_size; if (!event->rb) return -EINVAL; nr_pages = vma_size / PAGE_SIZE; mutex_lock(&event->mmap_mutex); ret = -EINVAL; rb = event->rb; if (!rb) goto aux_unlock; aux_offset = READ_ONCE(rb->user_page->aux_offset); aux_size = READ_ONCE(rb->user_page->aux_size); if (aux_offset < perf_data_size(rb) + PAGE_SIZE) goto aux_unlock; if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) goto aux_unlock; /* already mapped with a different offset */ if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) goto aux_unlock; if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) goto aux_unlock; /* already mapped with a different size */ if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) goto aux_unlock; if (!is_power_of_2(nr_pages)) goto aux_unlock; if (!atomic_inc_not_zero(&rb->mmap_count)) goto aux_unlock; if (rb_has_aux(rb)) { atomic_inc(&rb->aux_mmap_count); ret = 0; goto unlock; } atomic_set(&rb->aux_mmap_count, 1); user_extra = nr_pages; goto accounting; } /* * If we have rb pages ensure they're a power-of-two number, so we * can do bitmasks instead of modulo. */ if (nr_pages != 0 && !is_power_of_2(nr_pages)) return -EINVAL; if (vma_size != PAGE_SIZE * (1 + nr_pages)) return -EINVAL; WARN_ON_ONCE(event->ctx->parent_ctx); again: mutex_lock(&event->mmap_mutex); if (event->rb) { if (data_page_nr(event->rb) != nr_pages) { ret = -EINVAL; goto unlock; } if (!atomic_inc_not_zero(&event->rb->mmap_count)) { /* * Raced against perf_mmap_close(); remove the * event and try again. */ ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); goto again; } goto unlock; } user_extra = nr_pages + 1; accounting: user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); /* * Increase the limit linearly with more CPUs: */ user_lock_limit *= num_online_cpus(); user_locked = atomic_long_read(&user->locked_vm); /* * sysctl_perf_event_mlock may have changed, so that * user->locked_vm > user_lock_limit */ if (user_locked > user_lock_limit) user_locked = user_lock_limit; user_locked += user_extra; if (user_locked > user_lock_limit) { /* * charge locked_vm until it hits user_lock_limit; * charge the rest from pinned_vm */ extra = user_locked - user_lock_limit; user_extra -= extra; } lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; if ((locked > lock_limit) && perf_is_paranoid() && !capable(CAP_IPC_LOCK)) { ret = -EPERM; goto unlock; } WARN_ON(!rb && event->rb); if (vma->vm_flags & VM_WRITE) flags |= RING_BUFFER_WRITABLE; if (!rb) { rb = rb_alloc(nr_pages, event->attr.watermark ? event->attr.wakeup_watermark : 0, event->cpu, flags); if (!rb) { ret = -ENOMEM; goto unlock; } atomic_set(&rb->mmap_count, 1); rb->mmap_user = get_current_user(); rb->mmap_locked = extra; ring_buffer_attach(event, rb); perf_event_update_time(event); perf_event_init_userpage(event); perf_event_update_userpage(event); } else { ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, event->attr.aux_watermark, flags); if (!ret) rb->aux_mmap_locked = extra; } unlock: if (!ret) { atomic_long_add(user_extra, &user->locked_vm); atomic64_add(extra, &vma->vm_mm->pinned_vm); atomic_inc(&event->mmap_count); } else if (rb) { atomic_dec(&rb->mmap_count); } aux_unlock: mutex_unlock(&event->mmap_mutex); /* * Since pinned accounting is per vm we cannot allow fork() to copy our * vma. */ vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &perf_mmap_vmops; if (event->pmu->event_mapped) event->pmu->event_mapped(event, vma->vm_mm); return ret; } static int perf_fasync(int fd, struct file *filp, int on) { struct inode *inode = file_inode(filp); struct perf_event *event = filp->private_data; int retval; inode_lock(inode); retval = fasync_helper(fd, filp, on, &event->fasync); inode_unlock(inode); if (retval < 0) return retval; return 0; } static const struct file_operations perf_fops = { .llseek = no_llseek, .release = perf_release, .read = perf_read, .poll = perf_poll, .unlocked_ioctl = perf_ioctl, .compat_ioctl = perf_compat_ioctl, .mmap = perf_mmap, .fasync = perf_fasync, }; /* * Perf event wakeup * * If there's data, ensure we set the poll() state and publish everything * to user-space before waking everybody up. */ static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) { /* only the parent has fasync state */ if (event->parent) event = event->parent; return &event->fasync; } void perf_event_wakeup(struct perf_event *event) { ring_buffer_wakeup(event); if (event->pending_kill) { kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill); event->pending_kill = 0; } } static void perf_pending_event_disable(struct perf_event *event) { int cpu = READ_ONCE(event->pending_disable); if (cpu < 0) return; if (cpu == smp_processor_id()) { WRITE_ONCE(event->pending_disable, -1); perf_event_disable_local(event); return; } /* * CPU-A CPU-B * * perf_event_disable_inatomic() * @pending_disable = CPU-A; * irq_work_queue(); * * sched-out * @pending_disable = -1; * * sched-in * perf_event_disable_inatomic() * @pending_disable = CPU-B; * irq_work_queue(); // FAILS * * irq_work_run() * perf_pending_event() * * But the event runs on CPU-B and wants disabling there. */ irq_work_queue_on(&event->pending, cpu); } static void perf_pending_event(struct irq_work *entry) { struct perf_event *event = container_of(entry, struct perf_event, pending); int rctx; rctx = perf_swevent_get_recursion_context(); /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ perf_pending_event_disable(event); if (event->pending_wakeup) { event->pending_wakeup = 0; perf_event_wakeup(event); } if (rctx >= 0) perf_swevent_put_recursion_context(rctx); } /* * We assume there is only KVM supporting the callbacks. * Later on, we might change it to a list if there is * another virtualization implementation supporting the callbacks. */ struct perf_guest_info_callbacks __rcu *perf_guest_cbs; int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs))) return -EBUSY; rcu_assign_pointer(perf_guest_cbs, cbs); return 0; } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs)) return -EINVAL; rcu_assign_pointer(perf_guest_cbs, NULL); synchronize_rcu(); return 0; } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); static void perf_output_sample_regs(struct perf_output_handle *handle, struct pt_regs *regs, u64 mask) { int bit; DECLARE_BITMAP(_mask, 64); bitmap_from_u64(_mask, mask); for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) { u64 val; val = perf_reg_value(regs, bit); perf_output_put(handle, val); } } static void perf_sample_regs_user(struct perf_regs *regs_user, struct pt_regs *regs) { if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; } else if (!(current->flags & PF_KTHREAD)) { perf_get_regs_user(regs_user, regs); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; regs_user->regs = NULL; } } static void perf_sample_regs_intr(struct perf_regs *regs_intr, struct pt_regs *regs) { regs_intr->regs = regs; regs_intr->abi = perf_reg_abi(current); } /* * Get remaining task size from user stack pointer. * * It'd be better to take stack vma map and limit this more * precisely, but there's no way to get it safely under interrupt, * so using TASK_SIZE as limit. */ static u64 perf_ustack_task_size(struct pt_regs *regs) { unsigned long addr = perf_user_stack_pointer(regs); if (!addr || addr >= TASK_SIZE) return 0; return TASK_SIZE - addr; } static u16 perf_sample_ustack_size(u16 stack_size, u16 header_size, struct pt_regs *regs) { u64 task_size; /* No regs, no stack pointer, no dump. */ if (!regs) return 0; /* * Check if we fit in with the requested stack size into the: * - TASK_SIZE * If we don't, we limit the size to the TASK_SIZE. * * - remaining sample size * If we don't, we customize the stack size to * fit in to the remaining sample size. */ task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs)); stack_size = min(stack_size, (u16) task_size); /* Current header size plus static size and dynamic size. */ header_size += 2 * sizeof(u64); /* Do we fit in with the current stack dump size? */ if ((u16) (header_size + stack_size) < header_size) { /* * If we overflow the maximum size for the sample, * we customize the stack dump size to fit in. */ stack_size = USHRT_MAX - header_size - sizeof(u64); stack_size = round_up(stack_size, sizeof(u64)); } return stack_size; } static void perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, struct pt_regs *regs) { /* Case of a kernel thread, nothing to dump */ if (!regs) { u64 size = 0; perf_output_put(handle, size); } else { unsigned long sp; unsigned int rem; u64 dyn_size; mm_segment_t fs; /* * We dump: * static size * - the size requested by user or the best one we can fit * in to the sample max size * data * - user stack dump data * dynamic size * - the actual dumped size */ /* Static size. */ perf_output_put(handle, dump_size); /* Data. */ sp = perf_user_stack_pointer(regs); fs = force_uaccess_begin(); rem = __output_copy_user(handle, (void *) sp, dump_size); force_uaccess_end(fs); dyn_size = dump_size - rem; perf_output_skip(handle, rem); /* Dynamic size. */ perf_output_put(handle, dyn_size); } } static unsigned long perf_prepare_sample_aux(struct perf_event *event, struct perf_sample_data *data, size_t size) { struct perf_event *sampler = event->aux_event; struct perf_buffer *rb; data->aux_size = 0; if (!sampler) goto out; if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE)) goto out; if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id())) goto out; rb = ring_buffer_get(sampler); if (!rb) goto out; /* * If this is an NMI hit inside sampling code, don't take * the sample. See also perf_aux_sample_output(). */ if (READ_ONCE(rb->aux_in_sampling)) { data->aux_size = 0; } else { size = min_t(size_t, size, perf_aux_size(rb)); data->aux_size = ALIGN(size, sizeof(u64)); } ring_buffer_put(rb); out: return data->aux_size; } long perf_pmu_snapshot_aux(struct perf_buffer *rb, struct perf_event *event, struct perf_output_handle *handle, unsigned long size) { unsigned long flags; long ret; /* * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler * paths. If we start calling them in NMI context, they may race with * the IRQ ones, that is, for example, re-starting an event that's just * been stopped, which is why we're using a separate callback that * doesn't change the event state. * * IRQs need to be disabled to prevent IPIs from racing with us. */ local_irq_save(flags); /* * Guard against NMI hits inside the critical section; * see also perf_prepare_sample_aux(). */ WRITE_ONCE(rb->aux_in_sampling, 1); barrier(); ret = event->pmu->snapshot_aux(event, handle, size); barrier(); WRITE_ONCE(rb->aux_in_sampling, 0); local_irq_restore(flags); return ret; } static void perf_aux_sample_output(struct perf_event *event, struct perf_output_handle *handle, struct perf_sample_data *data) { struct perf_event *sampler = event->aux_event; struct perf_buffer *rb; unsigned long pad; long size; if (WARN_ON_ONCE(!sampler || !data->aux_size)) return; rb = ring_buffer_get(sampler); if (!rb) return; size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size); /* * An error here means that perf_output_copy() failed (returned a * non-zero surplus that it didn't copy), which in its current * enlightened implementation is not possible. If that changes, we'd * like to know. */ if (WARN_ON_ONCE(size < 0)) goto out_put; /* * The pad comes from ALIGN()ing data->aux_size up to u64 in * perf_prepare_sample_aux(), so should not be more than that. */ pad = data->aux_size - size; if (WARN_ON_ONCE(pad >= sizeof(u64))) pad = 8; if (pad) { u64 zero = 0; perf_output_copy(handle, &zero, pad); } out_put: ring_buffer_put(rb); } static void __perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) { u64 sample_type = event->attr.sample_type; data->type = sample_type; header->size += event->id_header_size; if (sample_type & PERF_SAMPLE_TID) { /* namespace issues */ data->tid_entry.pid = perf_event_pid(event, current); data->tid_entry.tid = perf_event_tid(event, current); } if (sample_type & PERF_SAMPLE_TIME) data->time = perf_event_clock(event); if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) data->id = primary_event_id(event); if (sample_type & PERF_SAMPLE_STREAM_ID) data->stream_id = event->id; if (sample_type & PERF_SAMPLE_CPU) { data->cpu_entry.cpu = raw_smp_processor_id(); data->cpu_entry.reserved = 0; } } void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) { if (event->attr.sample_id_all) __perf_event_header__init_id(header, data, event); } static void __perf_event__output_id_sample(struct perf_output_handle *handle, struct perf_sample_data *data) { u64 sample_type = data->type; if (sample_type & PERF_SAMPLE_TID) perf_output_put(handle, data->tid_entry); if (sample_type & PERF_SAMPLE_TIME) perf_output_put(handle, data->time); if (sample_type & PERF_SAMPLE_ID) perf_output_put(handle, data->id); if (sample_type & PERF_SAMPLE_STREAM_ID) perf_output_put(handle, data->stream_id); if (sample_type & PERF_SAMPLE_CPU) perf_output_put(handle, data->cpu_entry); if (sample_type & PERF_SAMPLE_IDENTIFIER) perf_output_put(handle, data->id); } void perf_event__output_id_sample(struct perf_event *event, struct perf_output_handle *handle, struct perf_sample_data *sample) { if (event->attr.sample_id_all) __perf_event__output_id_sample(handle, sample); } static void perf_output_read_one(struct perf_output_handle *handle, struct perf_event *event, u64 enabled, u64 running) { u64 read_format = event->attr.read_format; u64 values[5]; int n = 0; values[n++] = perf_event_count(event); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { values[n++] = enabled + atomic64_read(&event->child_total_time_enabled); } if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { values[n++] = running + atomic64_read(&event->child_total_time_running); } if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&event->lost_samples); __output_copy(handle, values, n * sizeof(u64)); } static void perf_output_read_group(struct perf_output_handle *handle, struct perf_event *event, u64 enabled, u64 running) { struct perf_event *leader = event->group_leader, *sub; u64 read_format = event->attr.read_format; unsigned long flags; u64 values[6]; int n = 0; /* * Disabling interrupts avoids all counter scheduling * (context switches, timer based rotation and IPIs). */ local_irq_save(flags); values[n++] = 1 + leader->nr_siblings; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = enabled; if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; if ((leader != event) && (leader->state == PERF_EVENT_STATE_ACTIVE)) leader->pmu->read(leader); values[n++] = perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&leader->lost_samples); __output_copy(handle, values, n * sizeof(u64)); for_each_sibling_event(sub, leader) { n = 0; if ((sub != event) && (sub->state == PERF_EVENT_STATE_ACTIVE)) sub->pmu->read(sub); values[n++] = perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&sub->lost_samples); __output_copy(handle, values, n * sizeof(u64)); } local_irq_restore(flags); } #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ PERF_FORMAT_TOTAL_TIME_RUNNING) /* * XXX PERF_SAMPLE_READ vs inherited events seems difficult. * * The problem is that its both hard and excessively expensive to iterate the * child list, not to mention that its impossible to IPI the children running * on another CPU, from interrupt/NMI context. */ static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { u64 enabled = 0, running = 0, now; u64 read_format = event->attr.read_format; /* * compute total_time_enabled, total_time_running * based on snapshot values taken when the event * was last scheduled in. * * we cannot simply called update_context_time() * because of locking issue as we are called in * NMI context */ if (read_format & PERF_FORMAT_TOTAL_TIMES) calc_timer_values(event, &now, &enabled, &running); if (event->attr.read_format & PERF_FORMAT_GROUP) perf_output_read_group(handle, event, enabled, running); else perf_output_read_one(handle, event, enabled, running); } static inline bool perf_sample_save_hw_index(struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; } void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) { u64 sample_type = data->type; perf_output_put(handle, *header); if (sample_type & PERF_SAMPLE_IDENTIFIER) perf_output_put(handle, data->id); if (sample_type & PERF_SAMPLE_IP) perf_output_put(handle, data->ip); if (sample_type & PERF_SAMPLE_TID) perf_output_put(handle, data->tid_entry); if (sample_type & PERF_SAMPLE_TIME) perf_output_put(handle, data->time); if (sample_type & PERF_SAMPLE_ADDR) perf_output_put(handle, data->addr); if (sample_type & PERF_SAMPLE_ID) perf_output_put(handle, data->id); if (sample_type & PERF_SAMPLE_STREAM_ID) perf_output_put(handle, data->stream_id); if (sample_type & PERF_SAMPLE_CPU) perf_output_put(handle, data->cpu_entry); if (sample_type & PERF_SAMPLE_PERIOD) perf_output_put(handle, data->period); if (sample_type & PERF_SAMPLE_READ) perf_output_read(handle, event); if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; size += data->callchain->nr; size *= sizeof(u64); __output_copy(handle, data->callchain, size); } if (sample_type & PERF_SAMPLE_RAW) { struct perf_raw_record *raw = data->raw; if (raw) { struct perf_raw_frag *frag = &raw->frag; perf_output_put(handle, raw->size); do { if (frag->copy) { __output_custom(handle, frag->copy, frag->data, frag->size); } else { __output_copy(handle, frag->data, frag->size); } if (perf_raw_frag_last(frag)) break; frag = frag->next; } while (1); if (frag->pad) __output_skip(handle, NULL, frag->pad); } else { struct { u32 size; u32 data; } raw = { .size = sizeof(u32), .data = 0, }; perf_output_put(handle, raw); } } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { if (data->br_stack) { size_t size; size = data->br_stack->nr * sizeof(struct perf_branch_entry); perf_output_put(handle, data->br_stack->nr); if (perf_sample_save_hw_index(event)) perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); } else { /* * we always store at least the value of nr */ u64 nr = 0; perf_output_put(handle, nr); } } if (sample_type & PERF_SAMPLE_REGS_USER) { u64 abi = data->regs_user.abi; /* * If there are no regs to dump, notice it through * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). */ perf_output_put(handle, abi); if (abi) { u64 mask = event->attr.sample_regs_user; perf_output_sample_regs(handle, data->regs_user.regs, mask); } } if (sample_type & PERF_SAMPLE_STACK_USER) { perf_output_sample_ustack(handle, data->stack_user_size, data->regs_user.regs); } if (sample_type & PERF_SAMPLE_WEIGHT) perf_output_put(handle, data->weight); if (sample_type & PERF_SAMPLE_DATA_SRC) perf_output_put(handle, data->data_src.val); if (sample_type & PERF_SAMPLE_TRANSACTION) perf_output_put(handle, data->txn); if (sample_type & PERF_SAMPLE_REGS_INTR) { u64 abi = data->regs_intr.abi; /* * If there are no regs to dump, notice it through * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). */ perf_output_put(handle, abi); if (abi) { u64 mask = event->attr.sample_regs_intr; perf_output_sample_regs(handle, data->regs_intr.regs, mask); } } if (sample_type & PERF_SAMPLE_PHYS_ADDR) perf_output_put(handle, data->phys_addr); if (sample_type & PERF_SAMPLE_CGROUP) perf_output_put(handle, data->cgroup); if (sample_type & PERF_SAMPLE_AUX) { perf_output_put(handle, data->aux_size); if (data->aux_size) perf_aux_sample_output(event, handle, data); } if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; if (wakeup_events) { struct perf_buffer *rb = handle->rb; int events = local_inc_return(&rb->events); if (events >= wakeup_events) { local_sub(wakeup_events, &rb->events); local_inc(&rb->wakeup); } } } } static u64 perf_virt_to_phys(u64 virt) { u64 phys_addr = 0; if (!virt) return 0; if (virt >= TASK_SIZE) { /* If it's vmalloc()d memory, leave phys_addr as 0 */ if (virt_addr_valid((void *)(uintptr_t)virt) && !(virt >= VMALLOC_START && virt < VMALLOC_END)) phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt); } else { /* * Walking the pages tables for user address. * Interrupts are disabled, so it prevents any tear down * of the page tables. * Try IRQ-safe get_user_page_fast_only first. * If failed, leave phys_addr as 0. */ if (current->mm != NULL) { struct page *p; pagefault_disable(); if (get_user_page_fast_only(virt, 0, &p)) { phys_addr = page_to_phys(p) + virt % PAGE_SIZE; put_page(p); } pagefault_enable(); } } return phys_addr; } static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { bool kernel = !event->attr.exclude_callchain_kernel; bool user = !event->attr.exclude_callchain_user; /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; const u32 max_stack = event->attr.sample_max_stack; struct perf_callchain_entry *callchain; if (!kernel && !user) return &__empty_callchain; callchain = get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true); return callchain ?: &__empty_callchain; } void perf_prepare_sample(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs) { u64 sample_type = event->attr.sample_type; header->type = PERF_RECORD_SAMPLE; header->size = sizeof(*header) + event->header_size; header->misc = 0; header->misc |= perf_misc_flags(regs); __perf_event_header__init_id(header, data, event); if (sample_type & PERF_SAMPLE_IP) data->ip = perf_instruction_pointer(regs); if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) data->callchain = perf_callchain(event, regs); size += data->callchain->nr; header->size += size * sizeof(u64); } if (sample_type & PERF_SAMPLE_RAW) { struct perf_raw_record *raw = data->raw; int size; if (raw) { struct perf_raw_frag *frag = &raw->frag; u32 sum = 0; do { sum += frag->size; if (perf_raw_frag_last(frag)) break; frag = frag->next; } while (1); size = round_up(sum + sizeof(u32), sizeof(u64)); raw->size = size - sizeof(u32); frag->pad = raw->size - sum; } else { size = sizeof(u64); } header->size += size; } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { int size = sizeof(u64); /* nr */ if (data->br_stack) { if (perf_sample_save_hw_index(event)) size += sizeof(u64); size += data->br_stack->nr * sizeof(struct perf_branch_entry); } header->size += size; } if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) perf_sample_regs_user(&data->regs_user, regs); if (sample_type & PERF_SAMPLE_REGS_USER) { /* regs dump ABI info */ int size = sizeof(u64); if (data->regs_user.regs) { u64 mask = event->attr.sample_regs_user; size += hweight64(mask) * sizeof(u64); } header->size += size; } if (sample_type & PERF_SAMPLE_STACK_USER) { /* * Either we need PERF_SAMPLE_STACK_USER bit to be always * processed as the last one or have additional check added * in case new sample type is added, because we could eat * up the rest of the sample size. */ u16 stack_size = event->attr.sample_stack_user; u16 size = sizeof(u64); stack_size = perf_sample_ustack_size(stack_size, header->size, data->regs_user.regs); /* * If there is something to dump, add space for the dump * itself and for the field that tells the dynamic size, * which is how many have been actually dumped. */ if (stack_size) size += sizeof(u64) + stack_size; data->stack_user_size = stack_size; header->size += size; } if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); perf_sample_regs_intr(&data->regs_intr, regs); if (data->regs_intr.regs) { u64 mask = event->attr.sample_regs_intr; size += hweight64(mask) * sizeof(u64); } header->size += size; } if (sample_type & PERF_SAMPLE_PHYS_ADDR) data->phys_addr = perf_virt_to_phys(data->addr); #ifdef CONFIG_CGROUP_PERF if (sample_type & PERF_SAMPLE_CGROUP) { struct cgroup *cgrp; /* protected by RCU */ cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup; data->cgroup = cgroup_id(cgrp); } #endif if (sample_type & PERF_SAMPLE_AUX) { u64 size; header->size += sizeof(u64); /* size */ /* * Given the 16bit nature of header::size, an AUX sample can * easily overflow it, what with all the preceding sample bits. * Make sure this doesn't happen by using up to U16_MAX bytes * per sample in total (rounded down to 8 byte boundary). */ size = min_t(size_t, U16_MAX - header->size, event->attr.aux_sample_size); size = rounddown(size, 8); size = perf_prepare_sample_aux(event, data, size); WARN_ON_ONCE(size + header->size > U16_MAX); header->size += size; } /* * If you're adding more sample types here, you likely need to do * something about the overflowing header::size, like repurpose the * lowest 3 bits of size, which should be always zero at the moment. * This raises a more important question, do we really need 512k sized * samples and why, so good argumentation is in order for whatever you * do here next. */ WARN_ON_ONCE(header->size & 7); } static __always_inline int __perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs, int (*output_begin)(struct perf_output_handle *, struct perf_sample_data *, struct perf_event *, unsigned int)) { struct perf_output_handle handle; struct perf_event_header header; int err; /* protect the callchain buffers */ rcu_read_lock(); perf_prepare_sample(&header, data, event, regs); err = output_begin(&handle, data, event, header.size); if (err) goto exit; perf_output_sample(&handle, &header, data, event); perf_output_end(&handle); exit: rcu_read_unlock(); return err; } void perf_event_output_forward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { __perf_event_output(event, data, regs, perf_output_begin_forward); } void perf_event_output_backward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { __perf_event_output(event, data, regs, perf_output_begin_backward); } int perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { return __perf_event_output(event, data, regs, perf_output_begin); } /* * read event_id */ struct perf_read_event { struct perf_event_header header; u32 pid; u32 tid; }; static void perf_event_read_event(struct perf_event *event, struct task_struct *task) { struct perf_output_handle handle; struct perf_sample_data sample; struct perf_read_event read_event = { .header = { .type = PERF_RECORD_READ, .misc = 0, .size = sizeof(read_event) + event->read_size, }, .pid = perf_event_pid(event, task), .tid = perf_event_tid(event, task), }; int ret; perf_event_header__init_id(&read_event.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, read_event.header.size); if (ret) return; perf_output_put(&handle, read_event); perf_output_read(&handle, event); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } typedef void (perf_iterate_f)(struct perf_event *event, void *data); static void perf_iterate_ctx(struct perf_event_context *ctx, perf_iterate_f output, void *data, bool all) { struct perf_event *event; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (!all) { if (event->state < PERF_EVENT_STATE_INACTIVE) continue; if (!event_filter_match(event)) continue; } output(event, data); } } static void perf_iterate_sb_cpu(perf_iterate_f output, void *data) { struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events); struct perf_event *event; list_for_each_entry_rcu(event, &pel->list, sb_list) { /* * Skip events that are not fully formed yet; ensure that * if we observe event->ctx, both event and ctx will be * complete enough. See perf_install_in_context(). */ if (!smp_load_acquire(&event->ctx)) continue; if (event->state < PERF_EVENT_STATE_INACTIVE) continue; if (!event_filter_match(event)) continue; output(event, data); } } /* * Iterate all events that need to receive side-band events. * * For new callers; ensure that account_pmu_sb_event() includes * your event, otherwise it might not get delivered. */ static void perf_iterate_sb(perf_iterate_f output, void *data, struct perf_event_context *task_ctx) { struct perf_event_context *ctx; int ctxn; rcu_read_lock(); preempt_disable(); /* * If we have task_ctx != NULL we only notify the task context itself. * The task_ctx is set only for EXIT events before releasing task * context. */ if (task_ctx) { perf_iterate_ctx(task_ctx, output, data, false); goto done; } perf_iterate_sb_cpu(output, data); for_each_task_context_nr(ctxn) { ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); if (ctx) perf_iterate_ctx(ctx, output, data, false); } done: preempt_enable(); rcu_read_unlock(); } /* * Clear all file-based filters at exec, they'll have to be * re-instated when/if these objects are mmapped again. */ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct perf_addr_filter *filter; unsigned int restart = 0, count = 0; unsigned long flags; if (!has_addr_filter(event)) return; raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { if (filter->path.dentry) { event->addr_filter_ranges[count].start = 0; event->addr_filter_ranges[count].size = 0; restart++; } count++; } if (restart) event->addr_filters_gen++; raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) perf_event_stop(event, 1); } void perf_event_exec(void) { struct perf_event_context *ctx; int ctxn; rcu_read_lock(); for_each_task_context_nr(ctxn) { ctx = current->perf_event_ctxp[ctxn]; if (!ctx) continue; perf_event_enable_on_exec(ctxn); perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); } rcu_read_unlock(); } struct remote_output { struct perf_buffer *rb; int err; }; static void __perf_event_output_stop(struct perf_event *event, void *data) { struct perf_event *parent = event->parent; struct remote_output *ro = data; struct perf_buffer *rb = ro->rb; struct stop_event_data sd = { .event = event, }; if (!has_aux(event)) return; if (!parent) parent = event; /* * In case of inheritance, it will be the parent that links to the * ring-buffer, but it will be the child that's actually using it. * * We are using event::rb to determine if the event should be stopped, * however this may race with ring_buffer_attach() (through set_output), * which will make us skip the event that actually needs to be stopped. * So ring_buffer_attach() has to stop an aux event before re-assigning * its rb pointer. */ if (rcu_dereference(parent->rb) == rb) ro->err = __perf_event_stop(&sd); } static int __perf_pmu_output_stop(void *info) { struct perf_event *event = info; struct pmu *pmu = event->ctx->pmu; struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); struct remote_output ro = { .rb = event->rb, }; rcu_read_lock(); perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); if (cpuctx->task_ctx) perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop, &ro, false); rcu_read_unlock(); return ro.err; } static void perf_pmu_output_stop(struct perf_event *event) { struct perf_event *iter; int err, cpu; restart: rcu_read_lock(); list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) { /* * For per-CPU events, we need to make sure that neither they * nor their children are running; for cpu==-1 events it's * sufficient to stop the event itself if it's active, since * it can't have children. */ cpu = iter->cpu; if (cpu == -1) cpu = READ_ONCE(iter->oncpu); if (cpu == -1) continue; err = cpu_function_call(cpu, __perf_pmu_output_stop, event); if (err == -EAGAIN) { rcu_read_unlock(); goto restart; } } rcu_read_unlock(); } /* * task tracking -- fork/exit * * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task */ struct perf_task_event { struct task_struct *task; struct perf_event_context *task_ctx; struct { struct perf_event_header header; u32 pid; u32 ppid; u32 tid; u32 ptid; u64 time; } event_id; }; static int perf_event_task_match(struct perf_event *event) { return event->attr.comm || event->attr.mmap || event->attr.mmap2 || event->attr.mmap_data || event->attr.task; } static void perf_event_task_output(struct perf_event *event, void *data) { struct perf_task_event *task_event = data; struct perf_output_handle handle; struct perf_sample_data sample; struct task_struct *task = task_event->task; int ret, size = task_event->event_id.header.size; if (!perf_event_task_match(event)) return; perf_event_header__init_id(&task_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, task_event->event_id.header.size); if (ret) goto out; task_event->event_id.pid = perf_event_pid(event, task); task_event->event_id.tid = perf_event_tid(event, task); if (task_event->event_id.header.type == PERF_RECORD_EXIT) { task_event->event_id.ppid = perf_event_pid(event, task->real_parent); task_event->event_id.ptid = perf_event_pid(event, task->real_parent); } else { /* PERF_RECORD_FORK */ task_event->event_id.ppid = perf_event_pid(event, current); task_event->event_id.ptid = perf_event_tid(event, current); } task_event->event_id.time = perf_event_clock(event); perf_output_put(&handle, task_event->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: task_event->event_id.header.size = size; } static void perf_event_task(struct task_struct *task, struct perf_event_context *task_ctx, int new) { struct perf_task_event task_event; if (!atomic_read(&nr_comm_events) && !atomic_read(&nr_mmap_events) && !atomic_read(&nr_task_events)) return; task_event = (struct perf_task_event){ .task = task, .task_ctx = task_ctx, .event_id = { .header = { .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT, .misc = 0, .size = sizeof(task_event.event_id), }, /* .pid */ /* .ppid */ /* .tid */ /* .ptid */ /* .time */ }, }; perf_iterate_sb(perf_event_task_output, &task_event, task_ctx); } void perf_event_fork(struct task_struct *task) { perf_event_task(task, NULL, 1); perf_event_namespaces(task); } /* * comm tracking */ struct perf_comm_event { struct task_struct *task; char *comm; int comm_size; struct { struct perf_event_header header; u32 pid; u32 tid; } event_id; }; static int perf_event_comm_match(struct perf_event *event) { return event->attr.comm; } static void perf_event_comm_output(struct perf_event *event, void *data) { struct perf_comm_event *comm_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int size = comm_event->event_id.header.size; int ret; if (!perf_event_comm_match(event)) return; perf_event_header__init_id(&comm_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, comm_event->event_id.header.size); if (ret) goto out; comm_event->event_id.pid = perf_event_pid(event, comm_event->task); comm_event->event_id.tid = perf_event_tid(event, comm_event->task); perf_output_put(&handle, comm_event->event_id); __output_copy(&handle, comm_event->comm, comm_event->comm_size); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: comm_event->event_id.header.size = size; } static void perf_event_comm_event(struct perf_comm_event *comm_event) { char comm[TASK_COMM_LEN]; unsigned int size; memset(comm, 0, sizeof(comm)); strlcpy(comm, comm_event->task->comm, sizeof(comm)); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; comm_event->comm_size = size; comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; perf_iterate_sb(perf_event_comm_output, comm_event, NULL); } void perf_event_comm(struct task_struct *task, bool exec) { struct perf_comm_event comm_event; if (!atomic_read(&nr_comm_events)) return; comm_event = (struct perf_comm_event){ .task = task, /* .comm */ /* .comm_size */ .event_id = { .header = { .type = PERF_RECORD_COMM, .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0, /* .size */ }, /* .pid */ /* .tid */ }, }; perf_event_comm_event(&comm_event); } /* * namespaces tracking */ struct perf_namespaces_event { struct task_struct *task; struct { struct perf_event_header header; u32 pid; u32 tid; u64 nr_namespaces; struct perf_ns_link_info link_info[NR_NAMESPACES]; } event_id; }; static int perf_event_namespaces_match(struct perf_event *event) { return event->attr.namespaces; } static void perf_event_namespaces_output(struct perf_event *event, void *data) { struct perf_namespaces_event *namespaces_event = data; struct perf_output_handle handle; struct perf_sample_data sample; u16 header_size = namespaces_event->event_id.header.size; int ret; if (!perf_event_namespaces_match(event)) return; perf_event_header__init_id(&namespaces_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, namespaces_event->event_id.header.size); if (ret) goto out; namespaces_event->event_id.pid = perf_event_pid(event, namespaces_event->task); namespaces_event->event_id.tid = perf_event_tid(event, namespaces_event->task); perf_output_put(&handle, namespaces_event->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: namespaces_event->event_id.header.size = header_size; } static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, struct task_struct *task, const struct proc_ns_operations *ns_ops) { struct path ns_path; struct inode *ns_inode; int error; error = ns_get_path(&ns_path, task, ns_ops); if (!error) { ns_inode = ns_path.dentry->d_inode; ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev); ns_link_info->ino = ns_inode->i_ino; path_put(&ns_path); } } void perf_event_namespaces(struct task_struct *task) { struct perf_namespaces_event namespaces_event; struct perf_ns_link_info *ns_link_info; if (!atomic_read(&nr_namespaces_events)) return; namespaces_event = (struct perf_namespaces_event){ .task = task, .event_id = { .header = { .type = PERF_RECORD_NAMESPACES, .misc = 0, .size = sizeof(namespaces_event.event_id), }, /* .pid */ /* .tid */ .nr_namespaces = NR_NAMESPACES, /* .link_info[NR_NAMESPACES] */ }, }; ns_link_info = namespaces_event.event_id.link_info; perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX], task, &mntns_operations); #ifdef CONFIG_USER_NS perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX], task, &userns_operations); #endif #ifdef CONFIG_NET_NS perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX], task, &netns_operations); #endif #ifdef CONFIG_UTS_NS perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX], task, &utsns_operations); #endif #ifdef CONFIG_IPC_NS perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX], task, &ipcns_operations); #endif #ifdef CONFIG_PID_NS perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX], task, &pidns_operations); #endif #ifdef CONFIG_CGROUPS perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX], task, &cgroupns_operations); #endif perf_iterate_sb(perf_event_namespaces_output, &namespaces_event, NULL); } /* * cgroup tracking */ #ifdef CONFIG_CGROUP_PERF struct perf_cgroup_event { char *path; int path_size; struct { struct perf_event_header header; u64 id; char path[]; } event_id; }; static int perf_event_cgroup_match(struct perf_event *event) { return event->attr.cgroup; } static void perf_event_cgroup_output(struct perf_event *event, void *data) { struct perf_cgroup_event *cgroup_event = data; struct perf_output_handle handle; struct perf_sample_data sample; u16 header_size = cgroup_event->event_id.header.size; int ret; if (!perf_event_cgroup_match(event)) return; perf_event_header__init_id(&cgroup_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, cgroup_event->event_id.header.size); if (ret) goto out; perf_output_put(&handle, cgroup_event->event_id); __output_copy(&handle, cgroup_event->path, cgroup_event->path_size); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: cgroup_event->event_id.header.size = header_size; } static void perf_event_cgroup(struct cgroup *cgrp) { struct perf_cgroup_event cgroup_event; char path_enomem[16] = "//enomem"; char *pathname; size_t size; if (!atomic_read(&nr_cgroup_events)) return; cgroup_event = (struct perf_cgroup_event){ .event_id = { .header = { .type = PERF_RECORD_CGROUP, .misc = 0, .size = sizeof(cgroup_event.event_id), }, .id = cgroup_id(cgrp), }, }; pathname = kmalloc(PATH_MAX, GFP_KERNEL); if (pathname == NULL) { cgroup_event.path = path_enomem; } else { /* just to be sure to have enough space for alignment */ cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64)); cgroup_event.path = pathname; } /* * Since our buffer works in 8 byte units we need to align our string * size to a multiple of 8. However, we must guarantee the tail end is * zero'd out to avoid leaking random bits to userspace. */ size = strlen(cgroup_event.path) + 1; while (!IS_ALIGNED(size, sizeof(u64))) cgroup_event.path[size++] = '\0'; cgroup_event.event_id.header.size += size; cgroup_event.path_size = size; perf_iterate_sb(perf_event_cgroup_output, &cgroup_event, NULL); kfree(pathname); } #endif /* * mmap tracking */ struct perf_mmap_event { struct vm_area_struct *vma; const char *file_name; int file_size; int maj, min; u64 ino; u64 ino_generation; u32 prot, flags; struct { struct perf_event_header header; u32 pid; u32 tid; u64 start; u64 len; u64 pgoff; } event_id; }; static int perf_event_mmap_match(struct perf_event *event, void *data) { struct perf_mmap_event *mmap_event = data; struct vm_area_struct *vma = mmap_event->vma; int executable = vma->vm_flags & VM_EXEC; return (!executable && event->attr.mmap_data) || (executable && (event->attr.mmap || event->attr.mmap2)); } static void perf_event_mmap_output(struct perf_event *event, void *data) { struct perf_mmap_event *mmap_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int size = mmap_event->event_id.header.size; u32 type = mmap_event->event_id.header.type; int ret; if (!perf_event_mmap_match(event, data)) return; if (event->attr.mmap2) { mmap_event->event_id.header.type = PERF_RECORD_MMAP2; mmap_event->event_id.header.size += sizeof(mmap_event->maj); mmap_event->event_id.header.size += sizeof(mmap_event->min); mmap_event->event_id.header.size += sizeof(mmap_event->ino); mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); mmap_event->event_id.header.size += sizeof(mmap_event->prot); mmap_event->event_id.header.size += sizeof(mmap_event->flags); } perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, mmap_event->event_id.header.size); if (ret) goto out; mmap_event->event_id.pid = perf_event_pid(event, current); mmap_event->event_id.tid = perf_event_tid(event, current); perf_output_put(&handle, mmap_event->event_id); if (event->attr.mmap2) { perf_output_put(&handle, mmap_event->maj); perf_output_put(&handle, mmap_event->min); perf_output_put(&handle, mmap_event->ino); perf_output_put(&handle, mmap_event->ino_generation); perf_output_put(&handle, mmap_event->prot); perf_output_put(&handle, mmap_event->flags); } __output_copy(&handle, mmap_event->file_name, mmap_event->file_size); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: mmap_event->event_id.header.size = size; mmap_event->event_id.header.type = type; } static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) { struct vm_area_struct *vma = mmap_event->vma; struct file *file = vma->vm_file; int maj = 0, min = 0; u64 ino = 0, gen = 0; u32 prot = 0, flags = 0; unsigned int size; char tmp[16]; char *buf = NULL; char *name; if (vma->vm_flags & VM_READ) prot |= PROT_READ; if (vma->vm_flags & VM_WRITE) prot |= PROT_WRITE; if (vma->vm_flags & VM_EXEC) prot |= PROT_EXEC; if (vma->vm_flags & VM_MAYSHARE) flags = MAP_SHARED; else flags = MAP_PRIVATE; if (vma->vm_flags & VM_DENYWRITE) flags |= MAP_DENYWRITE; if (vma->vm_flags & VM_MAYEXEC) flags |= MAP_EXECUTABLE; if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; if (is_vm_hugetlb_page(vma)) flags |= MAP_HUGETLB; if (file) { struct inode *inode; dev_t dev; buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) { name = "//enomem"; goto cpy_name; } /* * d_path() works from the end of the rb backwards, so we * need to add enough zero bytes after the string to handle * the 64bit alignment we do later. */ name = file_path(file, buf, PATH_MAX - sizeof(u64)); if (IS_ERR(name)) { name = "//toolong"; goto cpy_name; } inode = file_inode(vma->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; gen = inode->i_generation; maj = MAJOR(dev); min = MINOR(dev); goto got_name; } else { if (vma->vm_ops && vma->vm_ops->name) { name = (char *) vma->vm_ops->name(vma); if (name) goto cpy_name; } name = (char *)arch_vma_name(vma); if (name) goto cpy_name; if (vma->vm_start <= vma->vm_mm->start_brk && vma->vm_end >= vma->vm_mm->brk) { name = "[heap]"; goto cpy_name; } if (vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack) { name = "[stack]"; goto cpy_name; } name = "//anon"; goto cpy_name; } cpy_name: strlcpy(tmp, name, sizeof(tmp)); name = tmp; got_name: /* * Since our buffer works in 8 byte units we need to align our string * size to a multiple of 8. However, we must guarantee the tail end is * zero'd out to avoid leaking random bits to userspace. */ size = strlen(name)+1; while (!IS_ALIGNED(size, sizeof(u64))) name[size++] = '\0'; mmap_event->file_name = name; mmap_event->file_size = size; mmap_event->maj = maj; mmap_event->min = min; mmap_event->ino = ino; mmap_event->ino_generation = gen; mmap_event->prot = prot; mmap_event->flags = flags; if (!(vma->vm_flags & VM_EXEC)) mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; perf_iterate_sb(perf_event_mmap_output, mmap_event, NULL); kfree(buf); } /* * Check whether inode and address range match filter criteria. */ static bool perf_addr_filter_match(struct perf_addr_filter *filter, struct file *file, unsigned long offset, unsigned long size) { /* d_inode(NULL) won't be equal to any mapped user-space file */ if (!filter->path.dentry) return false; if (d_inode(filter->path.dentry) != file_inode(file)) return false; if (filter->offset > offset + size) return false; if (filter->offset + filter->size < offset) return false; return true; } static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter, struct vm_area_struct *vma, struct perf_addr_filter_range *fr) { unsigned long vma_size = vma->vm_end - vma->vm_start; unsigned long off = vma->vm_pgoff << PAGE_SHIFT; struct file *file = vma->vm_file; if (!perf_addr_filter_match(filter, file, off, vma_size)) return false; if (filter->offset < off) { fr->start = vma->vm_start; fr->size = min(vma_size, filter->size - (off - filter->offset)); } else { fr->start = vma->vm_start + filter->offset - off; fr->size = min(vma->vm_end - fr->start, filter->size); } return true; } static void __perf_addr_filters_adjust(struct perf_event *event, void *data) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct vm_area_struct *vma = data; struct perf_addr_filter *filter; unsigned int restart = 0, count = 0; unsigned long flags; if (!has_addr_filter(event)) return; if (!vma->vm_file) return; raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { if (perf_addr_filter_vma_adjust(filter, vma, &event->addr_filter_ranges[count])) restart++; count++; } if (restart) event->addr_filters_gen++; raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) perf_event_stop(event, 1); } /* * Adjust all task's events' filters to the new vma */ static void perf_addr_filters_adjust(struct vm_area_struct *vma) { struct perf_event_context *ctx; int ctxn; /* * Data tracing isn't supported yet and as such there is no need * to keep track of anything that isn't related to executable code: */ if (!(vma->vm_flags & VM_EXEC)) return; rcu_read_lock(); for_each_task_context_nr(ctxn) { ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); if (!ctx) continue; perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true); } rcu_read_unlock(); } void perf_event_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; if (!atomic_read(&nr_mmap_events)) return; mmap_event = (struct perf_mmap_event){ .vma = vma, /* .file_name */ /* .file_size */ .event_id = { .header = { .type = PERF_RECORD_MMAP, .misc = PERF_RECORD_MISC_USER, /* .size */ }, /* .pid */ /* .tid */ .start = vma->vm_start, .len = vma->vm_end - vma->vm_start, .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, }, /* .maj (attr_mmap2 only) */ /* .min (attr_mmap2 only) */ /* .ino (attr_mmap2 only) */ /* .ino_generation (attr_mmap2 only) */ /* .prot (attr_mmap2 only) */ /* .flags (attr_mmap2 only) */ }; perf_addr_filters_adjust(vma); perf_event_mmap_event(&mmap_event); } void perf_event_aux_event(struct perf_event *event, unsigned long head, unsigned long size, u64 flags) { struct perf_output_handle handle; struct perf_sample_data sample; struct perf_aux_event { struct perf_event_header header; u64 offset; u64 size; u64 flags; } rec = { .header = { .type = PERF_RECORD_AUX, .misc = 0, .size = sizeof(rec), }, .offset = head, .size = size, .flags = flags, }; int ret; perf_event_header__init_id(&rec.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; perf_output_put(&handle, rec); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } /* * Lost/dropped samples logging */ void perf_log_lost_samples(struct perf_event *event, u64 lost) { struct perf_output_handle handle; struct perf_sample_data sample; int ret; struct { struct perf_event_header header; u64 lost; } lost_samples_event = { .header = { .type = PERF_RECORD_LOST_SAMPLES, .misc = 0, .size = sizeof(lost_samples_event), }, .lost = lost, }; perf_event_header__init_id(&lost_samples_event.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, lost_samples_event.header.size); if (ret) return; perf_output_put(&handle, lost_samples_event); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } /* * context_switch tracking */ struct perf_switch_event { struct task_struct *task; struct task_struct *next_prev; struct { struct perf_event_header header; u32 next_prev_pid; u32 next_prev_tid; } event_id; }; static int perf_event_switch_match(struct perf_event *event) { return event->attr.context_switch; } static void perf_event_switch_output(struct perf_event *event, void *data) { struct perf_switch_event *se = data; struct perf_output_handle handle; struct perf_sample_data sample; int ret; if (!perf_event_switch_match(event)) return; /* Only CPU-wide events are allowed to see next/prev pid/tid */ if (event->ctx->task) { se->event_id.header.type = PERF_RECORD_SWITCH; se->event_id.header.size = sizeof(se->event_id.header); } else { se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE; se->event_id.header.size = sizeof(se->event_id); se->event_id.next_prev_pid = perf_event_pid(event, se->next_prev); se->event_id.next_prev_tid = perf_event_tid(event, se->next_prev); } perf_event_header__init_id(&se->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size); if (ret) return; if (event->ctx->task) perf_output_put(&handle, se->event_id.header); else perf_output_put(&handle, se->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in) { struct perf_switch_event switch_event; /* N.B. caller checks nr_switch_events != 0 */ switch_event = (struct perf_switch_event){ .task = task, .next_prev = next_prev, .event_id = { .header = { /* .type */ .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT, /* .size */ }, /* .next_prev_pid */ /* .next_prev_tid */ }, }; if (!sched_in && task->state == TASK_RUNNING) switch_event.event_id.header.misc |= PERF_RECORD_MISC_SWITCH_OUT_PREEMPT; perf_iterate_sb(perf_event_switch_output, &switch_event, NULL); } /* * IRQ throttle logging */ static void perf_log_throttle(struct perf_event *event, int enable) { struct perf_output_handle handle; struct perf_sample_data sample; int ret; struct { struct perf_event_header header; u64 time; u64 id; u64 stream_id; } throttle_event = { .header = { .type = PERF_RECORD_THROTTLE, .misc = 0, .size = sizeof(throttle_event), }, .time = perf_event_clock(event), .id = primary_event_id(event), .stream_id = event->id, }; if (enable) throttle_event.header.type = PERF_RECORD_UNTHROTTLE; perf_event_header__init_id(&throttle_event.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, throttle_event.header.size); if (ret) return; perf_output_put(&handle, throttle_event); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } /* * ksymbol register/unregister tracking */ struct perf_ksymbol_event { const char *name; int name_len; struct { struct perf_event_header header; u64 addr; u32 len; u16 ksym_type; u16 flags; } event_id; }; static int perf_event_ksymbol_match(struct perf_event *event) { return event->attr.ksymbol; } static void perf_event_ksymbol_output(struct perf_event *event, void *data) { struct perf_ksymbol_event *ksymbol_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int ret; if (!perf_event_ksymbol_match(event)) return; perf_event_header__init_id(&ksymbol_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, ksymbol_event->event_id.header.size); if (ret) return; perf_output_put(&handle, ksymbol_event->event_id); __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym) { struct perf_ksymbol_event ksymbol_event; char name[KSYM_NAME_LEN]; u16 flags = 0; int name_len; if (!atomic_read(&nr_ksymbol_events)) return; if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX || ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) goto err; strlcpy(name, sym, KSYM_NAME_LEN); name_len = strlen(name) + 1; while (!IS_ALIGNED(name_len, sizeof(u64))) name[name_len++] = '\0'; BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64)); if (unregister) flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER; ksymbol_event = (struct perf_ksymbol_event){ .name = name, .name_len = name_len, .event_id = { .header = { .type = PERF_RECORD_KSYMBOL, .size = sizeof(ksymbol_event.event_id) + name_len, }, .addr = addr, .len = len, .ksym_type = ksym_type, .flags = flags, }, }; perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL); return; err: WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type); } /* * bpf program load/unload tracking */ struct perf_bpf_event { struct bpf_prog *prog; struct { struct perf_event_header header; u16 type; u16 flags; u32 id; u8 tag[BPF_TAG_SIZE]; } event_id; }; static int perf_event_bpf_match(struct perf_event *event) { return event->attr.bpf_event; } static void perf_event_bpf_output(struct perf_event *event, void *data) { struct perf_bpf_event *bpf_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int ret; if (!perf_event_bpf_match(event)) return; perf_event_header__init_id(&bpf_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, bpf_event->event_id.header.size); if (ret) return; perf_output_put(&handle, bpf_event->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, enum perf_bpf_event_type type) { bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; int i; if (prog->aux->func_cnt == 0) { perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, (u64)(unsigned long)prog->bpf_func, prog->jited_len, unregister, prog->aux->ksym.name); } else { for (i = 0; i < prog->aux->func_cnt; i++) { struct bpf_prog *subprog = prog->aux->func[i]; perf_event_ksymbol( PERF_RECORD_KSYMBOL_TYPE_BPF, (u64)(unsigned long)subprog->bpf_func, subprog->jited_len, unregister, subprog->aux->ksym.name); } } } void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags) { struct perf_bpf_event bpf_event; if (type <= PERF_BPF_EVENT_UNKNOWN || type >= PERF_BPF_EVENT_MAX) return; switch (type) { case PERF_BPF_EVENT_PROG_LOAD: case PERF_BPF_EVENT_PROG_UNLOAD: if (atomic_read(&nr_ksymbol_events)) perf_event_bpf_emit_ksymbols(prog, type); break; default: break; } if (!atomic_read(&nr_bpf_events)) return; bpf_event = (struct perf_bpf_event){ .prog = prog, .event_id = { .header = { .type = PERF_RECORD_BPF_EVENT, .size = sizeof(bpf_event.event_id), }, .type = type, .flags = flags, .id = prog->aux->id, }, }; BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64)); memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE); perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); } struct perf_text_poke_event { const void *old_bytes; const void *new_bytes; size_t pad; u16 old_len; u16 new_len; struct { struct perf_event_header header; u64 addr; } event_id; }; static int perf_event_text_poke_match(struct perf_event *event) { return event->attr.text_poke; } static void perf_event_text_poke_output(struct perf_event *event, void *data) { struct perf_text_poke_event *text_poke_event = data; struct perf_output_handle handle; struct perf_sample_data sample; u64 padding = 0; int ret; if (!perf_event_text_poke_match(event)) return; perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, text_poke_event->event_id.header.size); if (ret) return; perf_output_put(&handle, text_poke_event->event_id); perf_output_put(&handle, text_poke_event->old_len); perf_output_put(&handle, text_poke_event->new_len); __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len); __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len); if (text_poke_event->pad) __output_copy(&handle, &padding, text_poke_event->pad); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len) { struct perf_text_poke_event text_poke_event; size_t tot, pad; if (!atomic_read(&nr_text_poke_events)) return; tot = sizeof(text_poke_event.old_len) + old_len; tot += sizeof(text_poke_event.new_len) + new_len; pad = ALIGN(tot, sizeof(u64)) - tot; text_poke_event = (struct perf_text_poke_event){ .old_bytes = old_bytes, .new_bytes = new_bytes, .pad = pad, .old_len = old_len, .new_len = new_len, .event_id = { .header = { .type = PERF_RECORD_TEXT_POKE, .misc = PERF_RECORD_MISC_KERNEL, .size = sizeof(text_poke_event.event_id) + tot + pad, }, .addr = (unsigned long)addr, }, }; perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL); } void perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; } static void perf_log_itrace_start(struct perf_event *event) { struct perf_output_handle handle; struct perf_sample_data sample; struct perf_aux_event { struct perf_event_header header; u32 pid; u32 tid; } rec; int ret; if (event->parent) event = event->parent; if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || event->attach_state & PERF_ATTACH_ITRACE) return; rec.header.type = PERF_RECORD_ITRACE_START; rec.header.misc = 0; rec.header.size = sizeof(rec); rec.pid = perf_event_pid(event, current); rec.tid = perf_event_tid(event, current); perf_event_header__init_id(&rec.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; perf_output_put(&handle, rec); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } static int __perf_event_account_interrupt(struct perf_event *event, int throttle) { struct hw_perf_event *hwc = &event->hw; int ret = 0; u64 seq; seq = __this_cpu_read(perf_throttled_seq); if (seq != hwc->interrupts_seq) { hwc->interrupts_seq = seq; hwc->interrupts = 1; } else { hwc->interrupts++; if (unlikely(throttle && hwc->interrupts > max_samples_per_tick)) { __this_cpu_inc(perf_throttled_count); tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(event, 0); ret = 1; } } if (event->attr.freq) { u64 now = perf_clock(); s64 delta = now - hwc->freq_time_stamp; hwc->freq_time_stamp = now; if (delta > 0 && delta < 2*TICK_NSEC) perf_adjust_period(event, delta, hwc->last_period, true); } return ret; } int perf_event_account_interrupt(struct perf_event *event) { return __perf_event_account_interrupt(event, 1); } /* * Generic event overflow handling, sampling. */ static int __perf_event_overflow(struct perf_event *event, int throttle, struct perf_sample_data *data, struct pt_regs *regs) { int events = atomic_read(&event->event_limit); int ret = 0; /* * Non-sampling counters might still use the PMI to fold short * hardware counters, ignore those. */ if (unlikely(!is_sampling_event(event))) return 0; ret = __perf_event_account_interrupt(event, throttle); /* * XXX event_limit might not quite work as expected on inherited * events */ event->pending_kill = POLL_IN; if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; perf_event_disable_inatomic(event); } READ_ONCE(event->overflow_handler)(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; irq_work_queue(&event->pending); } return ret; } int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { return __perf_event_overflow(event, 1, data, regs); } /* * Generic software event infrastructure */ struct swevent_htable { struct swevent_hlist *swevent_hlist; struct mutex hlist_mutex; int hlist_refcount; /* Recursion avoidance in each contexts */ int recursion[PERF_NR_CONTEXTS]; }; static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); /* * We directly increment event->count and keep a second value in * event->hw.period_left to count intervals. This period event * is kept in the range [-sample_period, 0] so that we can use the * sign as trigger. */ u64 perf_swevent_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; u64 period = hwc->last_period; u64 nr, offset; s64 old, val; hwc->last_period = hwc->sample_period; again: old = val = local64_read(&hwc->period_left); if (val < 0) return 0; nr = div64_u64(period + val, period); offset = nr * period; val -= offset; if (local64_cmpxchg(&hwc->period_left, old, val) != old) goto again; return nr; } static void perf_swevent_overflow(struct perf_event *event, u64 overflow, struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; int throttle = 0; if (!overflow) overflow = perf_swevent_set_period(event); if (hwc->interrupts == MAX_INTERRUPTS) return; for (; overflow; overflow--) { if (__perf_event_overflow(event, throttle, data, regs)) { /* * We inhibit the overflow from happening when * hwc->interrupts == MAX_INTERRUPTS. */ break; } throttle = 1; } } static void perf_swevent_event(struct perf_event *event, u64 nr, struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; local64_add(nr, &event->count); if (!regs) return; if (!is_sampling_event(event)) return; if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { data->period = nr; return perf_swevent_overflow(event, 1, data, regs); } else data->period = event->hw.last_period; if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) return perf_swevent_overflow(event, 1, data, regs); if (local64_add_negative(nr, &hwc->period_left)) return; perf_swevent_overflow(event, 0, data, regs); } static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) return 1; if (regs) { if (event->attr.exclude_user && user_mode(regs)) return 1; if (event->attr.exclude_kernel && !user_mode(regs)) return 1; } return 0; } static int perf_swevent_match(struct perf_event *event, enum perf_type_id type, u32 event_id, struct perf_sample_data *data, struct pt_regs *regs) { if (event->attr.type != type) return 0; if (event->attr.config != event_id) return 0; if (perf_exclude_event(event, regs)) return 0; return 1; } static inline u64 swevent_hash(u64 type, u32 event_id) { u64 val = event_id | (type << 32); return hash_64(val, SWEVENT_HLIST_BITS); } static inline struct hlist_head * __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) { u64 hash = swevent_hash(type, event_id); return &hlist->heads[hash]; } /* For the read side: events when they trigger */ static inline struct hlist_head * find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) { struct swevent_hlist *hlist; hlist = rcu_dereference(swhash->swevent_hlist); if (!hlist) return NULL; return __find_swevent_head(hlist, type, event_id); } /* For the event head insertion and removal in the hlist */ static inline struct hlist_head * find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) { struct swevent_hlist *hlist; u32 event_id = event->attr.config; u64 type = event->attr.type; /* * Event scheduling is always serialized against hlist allocation * and release. Which makes the protected version suitable here. * The context lock guarantees that. */ hlist = rcu_dereference_protected(swhash->swevent_hlist, lockdep_is_held(&event->ctx->lock)); if (!hlist) return NULL; return __find_swevent_head(hlist, type, event_id); } static void do_perf_sw_event(enum perf_type_id type, u32 event_id, u64 nr, struct perf_sample_data *data, struct pt_regs *regs) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); struct perf_event *event; struct hlist_head *head; rcu_read_lock(); head = find_swevent_head_rcu(swhash, type, event_id); if (!head) goto end; hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) perf_swevent_event(event, nr, data, regs); } end: rcu_read_unlock(); } DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); int perf_swevent_get_recursion_context(void) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); return get_recursion_context(swhash->recursion); } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); void perf_swevent_put_recursion_context(int rctx) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); put_recursion_context(swhash->recursion, rctx); } void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { struct perf_sample_data data; if (WARN_ON_ONCE(!regs)) return; perf_sample_data_init(&data, addr, 0); do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); } void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { int rctx; preempt_disable_notrace(); rctx = perf_swevent_get_recursion_context(); if (unlikely(rctx < 0)) goto fail; ___perf_sw_event(event_id, nr, regs, addr); perf_swevent_put_recursion_context(rctx); fail: preempt_enable_notrace(); } static void perf_swevent_read(struct perf_event *event) { } static int perf_swevent_add(struct perf_event *event, int flags) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); struct hw_perf_event *hwc = &event->hw; struct hlist_head *head; if (is_sampling_event(event)) { hwc->last_period = hwc->sample_period; perf_swevent_set_period(event); } hwc->state = !(flags & PERF_EF_START); head = find_swevent_head(swhash, event); if (WARN_ON_ONCE(!head)) return -EINVAL; hlist_add_head_rcu(&event->hlist_entry, head); perf_event_update_userpage(event); return 0; } static void perf_swevent_del(struct perf_event *event, int flags) { hlist_del_rcu(&event->hlist_entry); } static void perf_swevent_start(struct perf_event *event, int flags) { event->hw.state = 0; } static void perf_swevent_stop(struct perf_event *event, int flags) { event->hw.state = PERF_HES_STOPPED; } /* Deref the hlist from the update side */ static inline struct swevent_hlist * swevent_hlist_deref(struct swevent_htable *swhash) { return rcu_dereference_protected(swhash->swevent_hlist, lockdep_is_held(&swhash->hlist_mutex)); } static void swevent_hlist_release(struct swevent_htable *swhash) { struct swevent_hlist *hlist = swevent_hlist_deref(swhash); if (!hlist) return; RCU_INIT_POINTER(swhash->swevent_hlist, NULL); kfree_rcu(hlist, rcu_head); } static void swevent_hlist_put_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); if (!--swhash->hlist_refcount) swevent_hlist_release(swhash); mutex_unlock(&swhash->hlist_mutex); } static void swevent_hlist_put(void) { int cpu; for_each_possible_cpu(cpu) swevent_hlist_put_cpu(cpu); } static int swevent_hlist_get_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); int err = 0; mutex_lock(&swhash->hlist_mutex); if (!swevent_hlist_deref(swhash) && cpumask_test_cpu(cpu, perf_online_mask)) { struct swevent_hlist *hlist; hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); if (!hlist) { err = -ENOMEM; goto exit; } rcu_assign_pointer(swhash->swevent_hlist, hlist); } swhash->hlist_refcount++; exit: mutex_unlock(&swhash->hlist_mutex); return err; } static int swevent_hlist_get(void) { int err, cpu, failed_cpu; mutex_lock(&pmus_lock); for_each_possible_cpu(cpu) { err = swevent_hlist_get_cpu(cpu); if (err) { failed_cpu = cpu; goto fail; } } mutex_unlock(&pmus_lock); return 0; fail: for_each_possible_cpu(cpu) { if (cpu == failed_cpu) break; swevent_hlist_put_cpu(cpu); } mutex_unlock(&pmus_lock); return err; } struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; static void sw_perf_event_destroy(struct perf_event *event) { u64 event_id = event->attr.config; WARN_ON(event->parent); static_key_slow_dec(&perf_swevent_enabled[event_id]); swevent_hlist_put(); } static int perf_swevent_init(struct perf_event *event) { u64 event_id = event->attr.config; if (event->attr.type != PERF_TYPE_SOFTWARE) return -ENOENT; /* * no branch sampling for software events */ if (has_branch_stack(event)) return -EOPNOTSUPP; switch (event_id) { case PERF_COUNT_SW_CPU_CLOCK: case PERF_COUNT_SW_TASK_CLOCK: return -ENOENT; default: break; } if (event_id >= PERF_COUNT_SW_MAX) return -ENOENT; if (!event->parent) { int err; err = swevent_hlist_get(); if (err) return err; static_key_slow_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; } return 0; } static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, .event_init = perf_swevent_init, .add = perf_swevent_add, .del = perf_swevent_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, }; #ifdef CONFIG_EVENT_TRACING static int perf_tp_filter_match(struct perf_event *event, struct perf_sample_data *data) { void *record = data->raw->frag.data; /* only top level events have filters set */ if (event->parent) event = event->parent; if (likely(!event->filter) || filter_match_preds(event->filter, record)) return 1; return 0; } static int perf_tp_event_match(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) return 0; /* * If exclude_kernel, only trace user-space tracepoints (uprobes) */ if (event->attr.exclude_kernel && !user_mode(regs)) return 0; if (!perf_tp_filter_match(event, data)) return 0; return 1; } void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, struct trace_event_call *call, u64 count, struct pt_regs *regs, struct hlist_head *head, struct task_struct *task) { if (bpf_prog_array_valid(call)) { *(struct pt_regs **)raw_data = regs; if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; } } perf_tp_event(call->event.type, count, raw_data, size, regs, head, rctx, task); } EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task) { struct perf_sample_data data; struct perf_event *event; struct perf_raw_record raw = { .frag = { .size = entry_size, .data = record, }, }; perf_sample_data_init(&data, 0, 0); data.raw = &raw; perf_trace_buf_update(record, event_type); hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); } /* * If we got specified a target task, also iterate its context and * deliver this event there too. */ if (task && task != current) { struct perf_event_context *ctx; struct trace_entry *entry = record; rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]); if (!ctx) goto unlock; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->cpu != smp_processor_id()) continue; if (event->attr.type != PERF_TYPE_TRACEPOINT) continue; if (event->attr.config != entry->type) continue; if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); } unlock: rcu_read_unlock(); } perf_swevent_put_recursion_context(rctx); } EXPORT_SYMBOL_GPL(perf_tp_event); static void tp_perf_event_destroy(struct perf_event *event) { perf_trace_destroy(event); } static int perf_tp_event_init(struct perf_event *event) { int err; if (event->attr.type != PERF_TYPE_TRACEPOINT) return -ENOENT; /* * no branch sampling for tracepoint events */ if (has_branch_stack(event)) return -EOPNOTSUPP; err = perf_trace_init(event); if (err) return err; event->destroy = tp_perf_event_destroy; return 0; } static struct pmu perf_tracepoint = { .task_ctx_nr = perf_sw_context, .event_init = perf_tp_event_init, .add = perf_trace_add, .del = perf_trace_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, }; #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) /* * Flags in config, used by dynamic PMU kprobe and uprobe * The flags should match following PMU_FORMAT_ATTR(). * * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe * if not set, create kprobe/uprobe * * The following values specify a reference counter (or semaphore in the * terminology of tools like dtrace, systemtap, etc.) Userspace Statically * Defined Tracepoints (USDT). Currently, we use 40 bit for the offset. * * PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as th offset * PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left */ enum perf_probe_config { PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */ PERF_UPROBE_REF_CTR_OFFSET_BITS = 32, PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS, }; PMU_FORMAT_ATTR(retprobe, "config:0"); #endif #ifdef CONFIG_KPROBE_EVENTS static struct attribute *kprobe_attrs[] = { &format_attr_retprobe.attr, NULL, }; static struct attribute_group kprobe_format_group = { .name = "format", .attrs = kprobe_attrs, }; static const struct attribute_group *kprobe_attr_groups[] = { &kprobe_format_group, NULL, }; static int perf_kprobe_event_init(struct perf_event *event); static struct pmu perf_kprobe = { .task_ctx_nr = perf_sw_context, .event_init = perf_kprobe_event_init, .add = perf_trace_add, .del = perf_trace_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, .attr_groups = kprobe_attr_groups, }; static int perf_kprobe_event_init(struct perf_event *event) { int err; bool is_retprobe; if (event->attr.type != perf_kprobe.type) return -ENOENT; if (!perfmon_capable()) return -EACCES; /* * no branch sampling for probe events */ if (has_branch_stack(event)) return -EOPNOTSUPP; is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; err = perf_kprobe_init(event, is_retprobe); if (err) return err; event->destroy = perf_kprobe_destroy; return 0; } #endif /* CONFIG_KPROBE_EVENTS */ #ifdef CONFIG_UPROBE_EVENTS PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63"); static struct attribute *uprobe_attrs[] = { &format_attr_retprobe.attr, &format_attr_ref_ctr_offset.attr, NULL, }; static struct attribute_group uprobe_format_group = { .name = "format", .attrs = uprobe_attrs, }; static const struct attribute_group *uprobe_attr_groups[] = { &uprobe_format_group, NULL, }; static int perf_uprobe_event_init(struct perf_event *event); static struct pmu perf_uprobe = { .task_ctx_nr = perf_sw_context, .event_init = perf_uprobe_event_init, .add = perf_trace_add, .del = perf_trace_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, .attr_groups = uprobe_attr_groups, }; static int perf_uprobe_event_init(struct perf_event *event) { int err; unsigned long ref_ctr_offset; bool is_retprobe; if (event->attr.type != perf_uprobe.type) return -ENOENT; if (!perfmon_capable()) return -EACCES; /* * no branch sampling for probe events */ if (has_branch_stack(event)) return -EOPNOTSUPP; is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT; err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe); if (err) return err; event->destroy = perf_uprobe_destroy; return 0; } #endif /* CONFIG_UPROBE_EVENTS */ static inline void perf_tp_register(void) { perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); #ifdef CONFIG_KPROBE_EVENTS perf_pmu_register(&perf_kprobe, "kprobe", -1); #endif #ifdef CONFIG_UPROBE_EVENTS perf_pmu_register(&perf_uprobe, "uprobe", -1); #endif } static void perf_event_free_filter(struct perf_event *event) { ftrace_profile_free_filter(event); } #ifdef CONFIG_BPF_SYSCALL static void bpf_overflow_handler(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { struct bpf_perf_event_data_kern ctx = { .data = data, .event = event, }; int ret = 0; ctx.regs = perf_arch_bpf_user_pt_regs(regs); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) goto out; rcu_read_lock(); ret = BPF_PROG_RUN(event->prog, &ctx); rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); if (!ret) return; event->orig_overflow_handler(event, data, regs); } static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) { struct bpf_prog *prog; if (event->overflow_handler_context) /* hw breakpoint or kernel counter */ return -EINVAL; if (event->prog) return -EEXIST; prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT); if (IS_ERR(prog)) return PTR_ERR(prog); if (event->attr.precise_ip && prog->call_get_stack && (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) || event->attr.exclude_callchain_kernel || event->attr.exclude_callchain_user)) { /* * On perf_event with precise_ip, calling bpf_get_stack() * may trigger unwinder warnings and occasional crashes. * bpf_get_[stack|stackid] works around this issue by using * callchain attached to perf_sample_data. If the * perf_event does not full (kernel and user) callchain * attached to perf_sample_data, do not allow attaching BPF * program that calls bpf_get_[stack|stackid]. */ bpf_prog_put(prog); return -EPROTO; } event->prog = prog; event->orig_overflow_handler = READ_ONCE(event->overflow_handler); WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); return 0; } static void perf_event_free_bpf_handler(struct perf_event *event) { struct bpf_prog *prog = event->prog; if (!prog) return; WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler); event->prog = NULL; bpf_prog_put(prog); } #else static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) { return -EOPNOTSUPP; } static void perf_event_free_bpf_handler(struct perf_event *event) { } #endif /* * returns true if the event is a tracepoint, or a kprobe/upprobe created * with perf_event_open() */ static inline bool perf_event_is_tracing(struct perf_event *event) { if (event->pmu == &perf_tracepoint) return true; #ifdef CONFIG_KPROBE_EVENTS if (event->pmu == &perf_kprobe) return true; #endif #ifdef CONFIG_UPROBE_EVENTS if (event->pmu == &perf_uprobe) return true; #endif return false; } static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { bool is_kprobe, is_tracepoint, is_syscall_tp; struct bpf_prog *prog; int ret; if (!perf_event_is_tracing(event)) return perf_event_set_bpf_handler(event, prog_fd); is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; is_syscall_tp = is_syscall_trace_event(event->tp_event); if (!is_kprobe && !is_tracepoint && !is_syscall_tp) /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL; prog = bpf_prog_get(prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) || (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) || (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) { /* valid fd, but invalid bpf program type */ bpf_prog_put(prog); return -EINVAL; } /* Kprobe override only works for kprobes, not uprobes. */ if (prog->kprobe_override && !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { bpf_prog_put(prog); return -EINVAL; } if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); if (prog->aux->max_ctx_offset > off) { bpf_prog_put(prog); return -EACCES; } } ret = perf_event_attach_bpf_prog(event, prog); if (ret) bpf_prog_put(prog); return ret; } static void perf_event_free_bpf_prog(struct perf_event *event) { if (!perf_event_is_tracing(event)) { perf_event_free_bpf_handler(event); return; } perf_event_detach_bpf_prog(event); } #else static inline void perf_tp_register(void) { } static void perf_event_free_filter(struct perf_event *event) { } static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { return -ENOENT; } static void perf_event_free_bpf_prog(struct perf_event *event) { } #endif /* CONFIG_EVENT_TRACING */ #ifdef CONFIG_HAVE_HW_BREAKPOINT void perf_bp_event(struct perf_event *bp, void *data) { struct perf_sample_data sample; struct pt_regs *regs = data; perf_sample_data_init(&sample, bp->attr.bp_addr, 0); if (!bp->hw.state && !perf_exclude_event(bp, regs)) perf_swevent_event(bp, 1, &sample, regs); } #endif /* * Allocate a new address filter */ static struct perf_addr_filter * perf_addr_filter_new(struct perf_event *event, struct list_head *filters) { int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu); struct perf_addr_filter *filter; filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node); if (!filter) return NULL; INIT_LIST_HEAD(&filter->entry); list_add_tail(&filter->entry, filters); return filter; } static void free_filters_list(struct list_head *filters) { struct perf_addr_filter *filter, *iter; list_for_each_entry_safe(filter, iter, filters, entry) { path_put(&filter->path); list_del(&filter->entry); kfree(filter); } } /* * Free existing address filters and optionally install new ones */ static void perf_addr_filters_splice(struct perf_event *event, struct list_head *head) { unsigned long flags; LIST_HEAD(list); if (!has_addr_filter(event)) return; /* don't bother with children, they don't have their own filters */ if (event->parent) return; raw_spin_lock_irqsave(&event->addr_filters.lock, flags); list_splice_init(&event->addr_filters.list, &list); if (head) list_splice(head, &event->addr_filters.list); raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags); free_filters_list(&list); } /* * Scan through mm's vmas and see if one of them matches the * @filter; if so, adjust filter's address range. * Called with mm::mmap_lock down for reading. */ static void perf_addr_filter_apply(struct perf_addr_filter *filter, struct mm_struct *mm, struct perf_addr_filter_range *fr) { struct vm_area_struct *vma; for (vma = mm->mmap; vma; vma = vma->vm_next) { if (!vma->vm_file) continue; if (perf_addr_filter_vma_adjust(filter, vma, fr)) return; } } /* * Update event's address range filters based on the * task's existing mappings, if any. */ static void perf_event_addr_filters_apply(struct perf_event *event) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct task_struct *task = READ_ONCE(event->ctx->task); struct perf_addr_filter *filter; struct mm_struct *mm = NULL; unsigned int count = 0; unsigned long flags; /* * We may observe TASK_TOMBSTONE, which means that the event tear-down * will stop on the parent's child_mutex that our caller is also holding */ if (task == TASK_TOMBSTONE) return; if (ifh->nr_file_filters) { mm = get_task_mm(task); if (!mm) goto restart; mmap_read_lock(mm); } raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { if (filter->path.dentry) { /* * Adjust base offset if the filter is associated to a * binary that needs to be mapped: */ event->addr_filter_ranges[count].start = 0; event->addr_filter_ranges[count].size = 0; perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]); } else { event->addr_filter_ranges[count].start = filter->offset; event->addr_filter_ranges[count].size = filter->size; } count++; } event->addr_filters_gen++; raw_spin_unlock_irqrestore(&ifh->lock, flags); if (ifh->nr_file_filters) { mmap_read_unlock(mm); mmput(mm); } restart: perf_event_stop(event, 1); } /* * Address range filtering: limiting the data to certain * instruction address ranges. Filters are ioctl()ed to us from * userspace as ascii strings. * * Filter string format: * * ACTION RANGE_SPEC * where ACTION is one of the * * "filter": limit the trace to this region * * "start": start tracing from this address * * "stop": stop tracing at this address/region; * RANGE_SPEC is * * for kernel addresses: <start address>[/<size>] * * for object files: <start address>[/<size>]@</path/to/object/file> * * if <size> is not specified or is zero, the range is treated as a single * address; not valid for ACTION=="filter". */ enum { IF_ACT_NONE = -1, IF_ACT_FILTER, IF_ACT_START, IF_ACT_STOP, IF_SRC_FILE, IF_SRC_KERNEL, IF_SRC_FILEADDR, IF_SRC_KERNELADDR, }; enum { IF_STATE_ACTION = 0, IF_STATE_SOURCE, IF_STATE_END, }; static const match_table_t if_tokens = { { IF_ACT_FILTER, "filter" }, { IF_ACT_START, "start" }, { IF_ACT_STOP, "stop" }, { IF_SRC_FILE, "%u/%u@%s" }, { IF_SRC_KERNEL, "%u/%u" }, { IF_SRC_FILEADDR, "%u@%s" }, { IF_SRC_KERNELADDR, "%u" }, { IF_ACT_NONE, NULL }, }; /* * Address filter string parser */ static int perf_event_parse_addr_filter(struct perf_event *event, char *fstr, struct list_head *filters) { struct perf_addr_filter *filter = NULL; char *start, *orig, *filename = NULL; substring_t args[MAX_OPT_ARGS]; int state = IF_STATE_ACTION, token; unsigned int kernel = 0; int ret = -EINVAL; orig = fstr = kstrdup(fstr, GFP_KERNEL); if (!fstr) return -ENOMEM; while ((start = strsep(&fstr, " ,\n")) != NULL) { static const enum perf_addr_filter_action_t actions[] = { [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER, [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START, [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP, }; ret = -EINVAL; if (!*start) continue; /* filter definition begins */ if (state == IF_STATE_ACTION) { filter = perf_addr_filter_new(event, filters); if (!filter) goto fail; } token = match_token(start, if_tokens, args); switch (token) { case IF_ACT_FILTER: case IF_ACT_START: case IF_ACT_STOP: if (state != IF_STATE_ACTION) goto fail; filter->action = actions[token]; state = IF_STATE_SOURCE; break; case IF_SRC_KERNELADDR: case IF_SRC_KERNEL: kernel = 1; fallthrough; case IF_SRC_FILEADDR: case IF_SRC_FILE: if (state != IF_STATE_SOURCE) goto fail; *args[0].to = 0; ret = kstrtoul(args[0].from, 0, &filter->offset); if (ret) goto fail; if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) { *args[1].to = 0; ret = kstrtoul(args[1].from, 0, &filter->size); if (ret) goto fail; } if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { int fpos = token == IF_SRC_FILE ? 2 : 1; kfree(filename); filename = match_strdup(&args[fpos]); if (!filename) { ret = -ENOMEM; goto fail; } } state = IF_STATE_END; break; default: goto fail; } /* * Filter definition is fully parsed, validate and install it. * Make sure that it doesn't contradict itself or the event's * attribute. */ if (state == IF_STATE_END) { ret = -EINVAL; if (kernel && event->attr.exclude_kernel) goto fail; /* * ACTION "filter" must have a non-zero length region * specified. */ if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER && !filter->size) goto fail; if (!kernel) { if (!filename) goto fail; /* * For now, we only support file-based filters * in per-task events; doing so for CPU-wide * events requires additional context switching * trickery, since same object code will be * mapped at different virtual addresses in * different processes. */ ret = -EOPNOTSUPP; if (!event->ctx->task) goto fail; /* look up the path and grab its inode */ ret = kern_path(filename, LOOKUP_FOLLOW, &filter->path); if (ret) goto fail; ret = -EINVAL; if (!filter->path.dentry || !S_ISREG(d_inode(filter->path.dentry) ->i_mode)) goto fail; event->addr_filters.nr_file_filters++; } /* ready to consume more filters */ kfree(filename); filename = NULL; state = IF_STATE_ACTION; filter = NULL; kernel = 0; } } if (state != IF_STATE_ACTION) goto fail; kfree(filename); kfree(orig); return 0; fail: kfree(filename); free_filters_list(filters); kfree(orig); return ret; } static int perf_event_set_addr_filter(struct perf_event *event, char *filter_str) { LIST_HEAD(filters); int ret; /* * Since this is called in perf_ioctl() path, we're already holding * ctx::mutex. */ lockdep_assert_held(&event->ctx->mutex); if (WARN_ON_ONCE(event->parent)) return -EINVAL; ret = perf_event_parse_addr_filter(event, filter_str, &filters); if (ret) goto fail_clear_files; ret = event->pmu->addr_filters_validate(&filters); if (ret) goto fail_free_filters; /* remove existing filters, if any */ perf_addr_filters_splice(event, &filters); /* install new filters */ perf_event_for_each_child(event, perf_event_addr_filters_apply); return ret; fail_free_filters: free_filters_list(&filters); fail_clear_files: event->addr_filters.nr_file_filters = 0; return ret; } static int perf_event_set_filter(struct perf_event *event, void __user *arg) { int ret = -EINVAL; char *filter_str; filter_str = strndup_user(arg, PAGE_SIZE); if (IS_ERR(filter_str)) return PTR_ERR(filter_str); #ifdef CONFIG_EVENT_TRACING if (perf_event_is_tracing(event)) { struct perf_event_context *ctx = event->ctx; /* * Beware, here be dragons!! * * the tracepoint muck will deadlock against ctx->mutex, but * the tracepoint stuff does not actually need it. So * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we * already have a reference on ctx. * * This can result in event getting moved to a different ctx, * but that does not affect the tracepoint state. */ mutex_unlock(&ctx->mutex); ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); mutex_lock(&ctx->mutex); } else #endif if (has_addr_filter(event)) ret = perf_event_set_addr_filter(event, filter_str); kfree(filter_str); return ret; } /* * hrtimer based swevent callback */ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) { enum hrtimer_restart ret = HRTIMER_RESTART; struct perf_sample_data data; struct pt_regs *regs; struct perf_event *event; u64 period; event = container_of(hrtimer, struct perf_event, hw.hrtimer); if (event->state != PERF_EVENT_STATE_ACTIVE) return HRTIMER_NORESTART; event->pmu->read(event); perf_sample_data_init(&data, 0, event->hw.last_period); regs = get_irq_regs(); if (regs && !perf_exclude_event(event, regs)) { if (!(event->attr.exclude_idle && is_idle_task(current))) if (__perf_event_overflow(event, 1, &data, regs)) ret = HRTIMER_NORESTART; } period = max_t(u64, 10000, event->hw.sample_period); hrtimer_forward_now(hrtimer, ns_to_ktime(period)); return ret; } static void perf_swevent_start_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; s64 period; if (!is_sampling_event(event)) return; period = local64_read(&hwc->period_left); if (period) { if (period < 0) period = 10000; local64_set(&hwc->period_left, 0); } else { period = max_t(u64, 10000, hwc->sample_period); } hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), HRTIMER_MODE_REL_PINNED_HARD); } static void perf_swevent_cancel_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; if (is_sampling_event(event)) { ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); local64_set(&hwc->period_left, ktime_to_ns(remaining)); hrtimer_cancel(&hwc->hrtimer); } } static void perf_swevent_init_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; if (!is_sampling_event(event)) return; hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hwc->hrtimer.function = perf_swevent_hrtimer; /* * Since hrtimers have a fixed rate, we can do a static freq->period * mapping and avoid the whole period adjust feedback stuff. */ if (event->attr.freq) { long freq = event->attr.sample_freq; event->attr.sample_period = NSEC_PER_SEC / freq; hwc->sample_period = event->attr.sample_period; local64_set(&hwc->period_left, hwc->sample_period); hwc->last_period = hwc->sample_period; event->attr.freq = 0; } } /* * Software event: cpu wall time clock */ static void cpu_clock_event_update(struct perf_event *event) { s64 prev; u64 now; now = local_clock(); prev = local64_xchg(&event->hw.prev_count, now); local64_add(now - prev, &event->count); } static void cpu_clock_event_start(struct perf_event *event, int flags) { local64_set(&event->hw.prev_count, local_clock()); perf_swevent_start_hrtimer(event); } static void cpu_clock_event_stop(struct perf_event *event, int flags) { perf_swevent_cancel_hrtimer(event); cpu_clock_event_update(event); } static int cpu_clock_event_add(struct perf_event *event, int flags) { if (flags & PERF_EF_START) cpu_clock_event_start(event, flags); perf_event_update_userpage(event); return 0; } static void cpu_clock_event_del(struct perf_event *event, int flags) { cpu_clock_event_stop(event, flags); } static void cpu_clock_event_read(struct perf_event *event) { cpu_clock_event_update(event); } static int cpu_clock_event_init(struct perf_event *event) { if (event->attr.type != PERF_TYPE_SOFTWARE) return -ENOENT; if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) return -ENOENT; /* * no branch sampling for software events */ if (has_branch_stack(event)) return -EOPNOTSUPP; perf_swevent_init_hrtimer(event); return 0; } static struct pmu perf_cpu_clock = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, .event_init = cpu_clock_event_init, .add = cpu_clock_event_add, .del = cpu_clock_event_del, .start = cpu_clock_event_start, .stop = cpu_clock_event_stop, .read = cpu_clock_event_read, }; /* * Software event: task time clock */ static void task_clock_event_update(struct perf_event *event, u64 now) { u64 prev; s64 delta; prev = local64_xchg(&event->hw.prev_count, now); delta = now - prev; local64_add(delta, &event->count); } static void task_clock_event_start(struct perf_event *event, int flags) { local64_set(&event->hw.prev_count, event->ctx->time); perf_swevent_start_hrtimer(event); } static void task_clock_event_stop(struct perf_event *event, int flags) { perf_swevent_cancel_hrtimer(event); task_clock_event_update(event, event->ctx->time); } static int task_clock_event_add(struct perf_event *event, int flags) { if (flags & PERF_EF_START) task_clock_event_start(event, flags); perf_event_update_userpage(event); return 0; } static void task_clock_event_del(struct perf_event *event, int flags) { task_clock_event_stop(event, PERF_EF_UPDATE); } static void task_clock_event_read(struct perf_event *event) { u64 now = perf_clock(); u64 delta = now - event->ctx->timestamp; u64 time = event->ctx->time + delta; task_clock_event_update(event, time); } static int task_clock_event_init(struct perf_event *event) { if (event->attr.type != PERF_TYPE_SOFTWARE) return -ENOENT; if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) return -ENOENT; /* * no branch sampling for software events */ if (has_branch_stack(event)) return -EOPNOTSUPP; perf_swevent_init_hrtimer(event); return 0; } static struct pmu perf_task_clock = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, .event_init = task_clock_event_init, .add = task_clock_event_add, .del = task_clock_event_del, .start = task_clock_event_start, .stop = task_clock_event_stop, .read = task_clock_event_read, }; static void perf_pmu_nop_void(struct pmu *pmu) { } static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags) { } static int perf_pmu_nop_int(struct pmu *pmu) { return 0; } static int perf_event_nop_int(struct perf_event *event, u64 value) { return 0; } static DEFINE_PER_CPU(unsigned int, nop_txn_flags); static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) { __this_cpu_write(nop_txn_flags, flags); if (flags & ~PERF_PMU_TXN_ADD) return; perf_pmu_disable(pmu); } static int perf_pmu_commit_txn(struct pmu *pmu) { unsigned int flags = __this_cpu_read(nop_txn_flags); __this_cpu_write(nop_txn_flags, 0); if (flags & ~PERF_PMU_TXN_ADD) return 0; perf_pmu_enable(pmu); return 0; } static void perf_pmu_cancel_txn(struct pmu *pmu) { unsigned int flags = __this_cpu_read(nop_txn_flags); __this_cpu_write(nop_txn_flags, 0); if (flags & ~PERF_PMU_TXN_ADD) return; perf_pmu_enable(pmu); } static int perf_event_idx_default(struct perf_event *event) { return 0; } /* * Ensures all contexts with the same task_ctx_nr have the same * pmu_cpu_context too. */ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn) { struct pmu *pmu; if (ctxn < 0) return NULL; list_for_each_entry(pmu, &pmus, entry) { if (pmu->task_ctx_nr == ctxn) return pmu->pmu_cpu_context; } return NULL; } static void free_pmu_context(struct pmu *pmu) { /* * Static contexts such as perf_sw_context have a global lifetime * and may be shared between different PMUs. Avoid freeing them * when a single PMU is going away. */ if (pmu->task_ctx_nr > perf_invalid_context) return; free_percpu(pmu->pmu_cpu_context); } /* * Let userspace know that this PMU supports address range filtering: */ static ssize_t nr_addr_filters_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); } DEVICE_ATTR_RO(nr_addr_filters); static struct idr pmu_idr; static ssize_t type_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); } static DEVICE_ATTR_RO(type); static ssize_t perf_event_mux_interval_ms_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); } static DEFINE_MUTEX(mux_interval_mutex); static ssize_t perf_event_mux_interval_ms_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct pmu *pmu = dev_get_drvdata(dev); int timer, cpu, ret; ret = kstrtoint(buf, 0, &timer); if (ret) return ret; if (timer < 1) return -EINVAL; /* same value, noting to do */ if (timer == pmu->hrtimer_interval_ms) return count; mutex_lock(&mux_interval_mutex); pmu->hrtimer_interval_ms = timer; /* update all cpuctx for this PMU */ cpus_read_lock(); for_each_online_cpu(cpu) { struct perf_cpu_context *cpuctx; cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpuctx); } cpus_read_unlock(); mutex_unlock(&mux_interval_mutex); return count; } static DEVICE_ATTR_RW(perf_event_mux_interval_ms); static struct attribute *pmu_dev_attrs[] = { &dev_attr_type.attr, &dev_attr_perf_event_mux_interval_ms.attr, &dev_attr_nr_addr_filters.attr, NULL, }; static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = kobj_to_dev(kobj); struct pmu *pmu = dev_get_drvdata(dev); if (n == 2 && !pmu->nr_addr_filters) return 0; return a->mode; } static struct attribute_group pmu_dev_attr_group = { .is_visible = pmu_dev_is_visible, .attrs = pmu_dev_attrs, }; static const struct attribute_group *pmu_dev_groups[] = { &pmu_dev_attr_group, NULL, }; static int pmu_bus_running; static struct bus_type pmu_bus = { .name = "event_source", .dev_groups = pmu_dev_groups, }; static void pmu_dev_release(struct device *dev) { kfree(dev); } static int pmu_dev_alloc(struct pmu *pmu) { int ret = -ENOMEM; pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); if (!pmu->dev) goto out; pmu->dev->groups = pmu->attr_groups; device_initialize(pmu->dev); dev_set_drvdata(pmu->dev, pmu); pmu->dev->bus = &pmu_bus; pmu->dev->release = pmu_dev_release; ret = dev_set_name(pmu->dev, "%s", pmu->name); if (ret) goto free_dev; ret = device_add(pmu->dev); if (ret) goto free_dev; if (pmu->attr_update) { ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update); if (ret) goto del_dev; } out: return ret; del_dev: device_del(pmu->dev); free_dev: put_device(pmu->dev); goto out; } static struct lock_class_key cpuctx_mutex; static struct lock_class_key cpuctx_lock; int perf_pmu_register(struct pmu *pmu, const char *name, int type) { int cpu, ret, max = PERF_TYPE_MAX; mutex_lock(&pmus_lock); ret = -ENOMEM; pmu->pmu_disable_count = alloc_percpu(int); if (!pmu->pmu_disable_count) goto unlock; pmu->type = -1; if (!name) goto skip_type; pmu->name = name; if (type != PERF_TYPE_SOFTWARE) { if (type >= 0) max = type; ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL); if (ret < 0) goto free_pdc; WARN_ON(type >= 0 && ret != type); type = ret; } pmu->type = type; if (pmu_bus_running) { ret = pmu_dev_alloc(pmu); if (ret) goto free_idr; } skip_type: if (pmu->task_ctx_nr == perf_hw_context) { static int hw_context_taken = 0; /* * Other than systems with heterogeneous CPUs, it never makes * sense for two PMUs to share perf_hw_context. PMUs which are * uncore must use perf_invalid_context. */ if (WARN_ON_ONCE(hw_context_taken && !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS))) pmu->task_ctx_nr = perf_invalid_context; hw_context_taken = 1; } pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); if (pmu->pmu_cpu_context) goto got_cpu_context; ret = -ENOMEM; pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); if (!pmu->pmu_cpu_context) goto free_dev; for_each_possible_cpu(cpu) { struct perf_cpu_context *cpuctx; cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); __perf_event_init_context(&cpuctx->ctx); lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); cpuctx->ctx.pmu = pmu; cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); __perf_mux_hrtimer_init(cpuctx, cpu); cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default); cpuctx->heap = cpuctx->heap_default; } got_cpu_context: if (!pmu->start_txn) { if (pmu->pmu_enable) { /* * If we have pmu_enable/pmu_disable calls, install * transaction stubs that use that to try and batch * hardware accesses. */ pmu->start_txn = perf_pmu_start_txn; pmu->commit_txn = perf_pmu_commit_txn; pmu->cancel_txn = perf_pmu_cancel_txn; } else { pmu->start_txn = perf_pmu_nop_txn; pmu->commit_txn = perf_pmu_nop_int; pmu->cancel_txn = perf_pmu_nop_void; } } if (!pmu->pmu_enable) { pmu->pmu_enable = perf_pmu_nop_void; pmu->pmu_disable = perf_pmu_nop_void; } if (!pmu->check_period) pmu->check_period = perf_event_nop_int; if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; /* * Ensure the TYPE_SOFTWARE PMUs are at the head of the list, * since these cannot be in the IDR. This way the linear search * is fast, provided a valid software event is provided. */ if (type == PERF_TYPE_SOFTWARE || !name) list_add_rcu(&pmu->entry, &pmus); else list_add_tail_rcu(&pmu->entry, &pmus); atomic_set(&pmu->exclusive_cnt, 0); ret = 0; unlock: mutex_unlock(&pmus_lock); return ret; free_dev: device_del(pmu->dev); put_device(pmu->dev); free_idr: if (pmu->type != PERF_TYPE_SOFTWARE) idr_remove(&pmu_idr, pmu->type); free_pdc: free_percpu(pmu->pmu_disable_count); goto unlock; } EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { mutex_lock(&pmus_lock); list_del_rcu(&pmu->entry); /* * We dereference the pmu list under both SRCU and regular RCU, so * synchronize against both of those. */ synchronize_srcu(&pmus_srcu); synchronize_rcu(); free_percpu(pmu->pmu_disable_count); if (pmu->type != PERF_TYPE_SOFTWARE) idr_remove(&pmu_idr, pmu->type); if (pmu_bus_running) { if (pmu->nr_addr_filters) device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); device_del(pmu->dev); put_device(pmu->dev); } free_pmu_context(pmu); mutex_unlock(&pmus_lock); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); static inline bool has_extended_regs(struct perf_event *event) { return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) || (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK); } static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) { struct perf_event_context *ctx = NULL; int ret; if (!try_module_get(pmu->module)) return -ENODEV; /* * A number of pmu->event_init() methods iterate the sibling_list to, * for example, validate if the group fits on the PMU. Therefore, * if this is a sibling event, acquire the ctx->mutex to protect * the sibling_list. */ if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) { /* * This ctx->mutex can nest when we're called through * inheritance. See the perf_event_ctx_lock_nested() comment. */ ctx = perf_event_ctx_lock_nested(event->group_leader, SINGLE_DEPTH_NESTING); BUG_ON(!ctx); } event->pmu = pmu; ret = pmu->event_init(event); if (ctx) perf_event_ctx_unlock(event->group_leader, ctx); if (!ret) { if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && has_extended_regs(event)) ret = -EOPNOTSUPP; if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && event_has_any_exclude_flag(event)) ret = -EINVAL; if (ret && event->destroy) event->destroy(event); } if (ret) module_put(pmu->module); return ret; } static struct pmu *perf_init_event(struct perf_event *event) { int idx, type, ret; struct pmu *pmu; idx = srcu_read_lock(&pmus_srcu); /* Try parent's PMU first: */ if (event->parent && event->parent->pmu) { pmu = event->parent->pmu; ret = perf_try_init_event(pmu, event); if (!ret) goto unlock; } /* * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE * are often aliases for PERF_TYPE_RAW. */ type = event->attr.type; if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) type = PERF_TYPE_RAW; again: rcu_read_lock(); pmu = idr_find(&pmu_idr, type); rcu_read_unlock(); if (pmu) { ret = perf_try_init_event(pmu, event); if (ret == -ENOENT && event->attr.type != type) { type = event->attr.type; goto again; } if (ret) pmu = ERR_PTR(ret); goto unlock; } list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { ret = perf_try_init_event(pmu, event); if (!ret) goto unlock; if (ret != -ENOENT) { pmu = ERR_PTR(ret); goto unlock; } } pmu = ERR_PTR(-ENOENT); unlock: srcu_read_unlock(&pmus_srcu, idx); return pmu; } static void attach_sb_event(struct perf_event *event) { struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); raw_spin_lock(&pel->lock); list_add_rcu(&event->sb_list, &pel->list); raw_spin_unlock(&pel->lock); } /* * We keep a list of all !task (and therefore per-cpu) events * that need to receive side-band records. * * This avoids having to scan all the various PMU per-cpu contexts * looking for them. */ static void account_pmu_sb_event(struct perf_event *event) { if (is_sb_event(event)) attach_sb_event(event); } static void account_event_cpu(struct perf_event *event, int cpu) { if (event->parent) return; if (is_cgroup_event(event)) atomic_inc(&per_cpu(perf_cgroup_events, cpu)); } /* Freq events need the tick to stay alive (see perf_event_task_tick). */ static void account_freq_event_nohz(void) { #ifdef CONFIG_NO_HZ_FULL /* Lock so we don't race with concurrent unaccount */ spin_lock(&nr_freq_lock); if (atomic_inc_return(&nr_freq_events) == 1) tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS); spin_unlock(&nr_freq_lock); #endif } static void account_freq_event(void) { if (tick_nohz_full_enabled()) account_freq_event_nohz(); else atomic_inc(&nr_freq_events); } static void account_event(struct perf_event *event) { bool inc = false; if (event->parent) return; if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) inc = true; if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) atomic_inc(&nr_comm_events); if (event->attr.namespaces) atomic_inc(&nr_namespaces_events); if (event->attr.cgroup) atomic_inc(&nr_cgroup_events); if (event->attr.task) atomic_inc(&nr_task_events); if (event->attr.freq) account_freq_event(); if (event->attr.context_switch) { atomic_inc(&nr_switch_events); inc = true; } if (has_branch_stack(event)) inc = true; if (is_cgroup_event(event)) inc = true; if (event->attr.ksymbol) atomic_inc(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_inc(&nr_bpf_events); if (event->attr.text_poke) atomic_inc(&nr_text_poke_events); if (inc) { /* * We need the mutex here because static_branch_enable() * must complete *before* the perf_sched_count increment * becomes visible. */ if (atomic_inc_not_zero(&perf_sched_count)) goto enabled; mutex_lock(&perf_sched_mutex); if (!atomic_read(&perf_sched_count)) { static_branch_enable(&perf_sched_events); /* * Guarantee that all CPUs observe they key change and * call the perf scheduling hooks before proceeding to * install events that need them. */ synchronize_rcu(); } /* * Now that we have waited for the sync_sched(), allow further * increments to by-pass the mutex. */ atomic_inc(&perf_sched_count); mutex_unlock(&perf_sched_mutex); } enabled: account_event_cpu(event, event->cpu); account_pmu_sb_event(event); } /* * Allocate and initialize an event structure */ static struct perf_event * perf_event_alloc(struct perf_event_attr *attr, int cpu, struct task_struct *task, struct perf_event *group_leader, struct perf_event *parent_event, perf_overflow_handler_t overflow_handler, void *context, int cgroup_fd) { struct pmu *pmu; struct perf_event *event; struct hw_perf_event *hwc; long err = -EINVAL; if ((unsigned)cpu >= nr_cpu_ids) { if (!task || cpu != -1) return ERR_PTR(-EINVAL); } event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) return ERR_PTR(-ENOMEM); /* * Single events are their own group leaders, with an * empty sibling list: */ if (!group_leader) group_leader = event; mutex_init(&event->child_mutex); INIT_LIST_HEAD(&event->child_list); INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); INIT_LIST_HEAD(&event->active_list); init_event_group(event); INIT_LIST_HEAD(&event->rb_entry); INIT_LIST_HEAD(&event->active_entry); INIT_LIST_HEAD(&event->addr_filters.list); INIT_HLIST_NODE(&event->hlist_entry); init_waitqueue_head(&event->waitq); event->pending_disable = -1; init_irq_work(&event->pending, perf_pending_event); mutex_init(&event->mmap_mutex); raw_spin_lock_init(&event->addr_filters.lock); atomic_long_set(&event->refcount, 1); event->cpu = cpu; event->attr = *attr; event->group_leader = group_leader; event->pmu = NULL; event->oncpu = -1; event->parent = parent_event; event->ns = get_pid_ns(task_active_pid_ns(current)); event->id = atomic64_inc_return(&perf_event_id); event->state = PERF_EVENT_STATE_INACTIVE; if (task) { event->attach_state = PERF_ATTACH_TASK; /* * XXX pmu::event_init needs to know what task to account to * and we cannot use the ctx information because we need the * pmu before we get a ctx. */ event->hw.target = get_task_struct(task); } event->clock = &local_clock; if (parent_event) event->clock = parent_event->clock; if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) if (overflow_handler == bpf_overflow_handler) { struct bpf_prog *prog = parent_event->prog; bpf_prog_inc(prog); event->prog = prog; event->orig_overflow_handler = parent_event->orig_overflow_handler; } #endif } if (overflow_handler) { event->overflow_handler = overflow_handler; event->overflow_handler_context = context; } else if (is_write_backward(event)){ event->overflow_handler = perf_event_output_backward; event->overflow_handler_context = NULL; } else { event->overflow_handler = perf_event_output_forward; event->overflow_handler_context = NULL; } perf_event__state_init(event); pmu = NULL; hwc = &event->hw; hwc->sample_period = attr->sample_period; if (attr->freq && attr->sample_freq) hwc->sample_period = 1; hwc->last_period = hwc->sample_period; local64_set(&hwc->period_left, hwc->sample_period); /* * We currently do not support PERF_SAMPLE_READ on inherited events. * See perf_output_read(). */ if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)) goto err_ns; if (!has_branch_stack(event)) event->attr.branch_sample_type = 0; pmu = perf_init_event(event); if (IS_ERR(pmu)) { err = PTR_ERR(pmu); goto err_ns; } /* * Disallow uncore-cgroup events, they don't make sense as the cgroup will * be different on other CPUs in the uncore mask. */ if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) { err = -EINVAL; goto err_pmu; } if (event->attr.aux_output && !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) { err = -EOPNOTSUPP; goto err_pmu; } if (cgroup_fd != -1) { err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); if (err) goto err_pmu; } err = exclusive_event_init(event); if (err) goto err_pmu; if (has_addr_filter(event)) { event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters, sizeof(struct perf_addr_filter_range), GFP_KERNEL); if (!event->addr_filter_ranges) { err = -ENOMEM; goto err_per_task; } /* * Clone the parent's vma offsets: they are valid until exec() * even if the mm is not shared with the parent. */ if (event->parent) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); raw_spin_lock_irq(&ifh->lock); memcpy(event->addr_filter_ranges, event->parent->addr_filter_ranges, pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range)); raw_spin_unlock_irq(&ifh->lock); } /* force hw sync on the address filters */ event->addr_filters_gen = 1; } if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(attr->sample_max_stack); if (err) goto err_addr_filters; } } err = security_perf_event_alloc(event); if (err) goto err_callchain_buffer; /* symmetric to unaccount_event() in _free_event() */ account_event(event); return event; err_callchain_buffer: if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) put_callchain_buffers(); } err_addr_filters: kfree(event->addr_filter_ranges); err_per_task: exclusive_event_destroy(event); err_pmu: if (is_cgroup_event(event)) perf_detach_cgroup(event); if (event->destroy) event->destroy(event); module_put(pmu->module); err_ns: if (event->ns) put_pid_ns(event->ns); if (event->hw.target) put_task_struct(event->hw.target); kfree(event); return ERR_PTR(err); } static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr) { u32 size; int ret; /* Zero the full structure, so that a short copy will be nice. */ memset(attr, 0, sizeof(*attr)); ret = get_user(size, &uattr->size); if (ret) return ret; /* ABI compatibility quirk: */ if (!size) size = PERF_ATTR_SIZE_VER0; if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE) goto err_size; ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); if (ret) { if (ret == -E2BIG) goto err_size; return ret; } attr->size = size; if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) return -EINVAL; if (attr->read_format & ~(PERF_FORMAT_MAX-1)) return -EINVAL; if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) { u64 mask = attr->branch_sample_type; /* only using defined bits */ if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1)) return -EINVAL; /* at least one branch bit must be set */ if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) return -EINVAL; /* propagate priv level, when not set for branch */ if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { /* exclude_kernel checked on syscall entry */ if (!attr->exclude_kernel) mask |= PERF_SAMPLE_BRANCH_KERNEL; if (!attr->exclude_user) mask |= PERF_SAMPLE_BRANCH_USER; if (!attr->exclude_hv) mask |= PERF_SAMPLE_BRANCH_HV; /* * adjust user setting (for HW filter setup) */ attr->branch_sample_type = mask; } /* privileged levels capture (kernel, hv): check permissions */ if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) { ret = perf_allow_kernel(attr); if (ret) return ret; } } if (attr->sample_type & PERF_SAMPLE_REGS_USER) { ret = perf_reg_validate(attr->sample_regs_user); if (ret) return ret; } if (attr->sample_type & PERF_SAMPLE_STACK_USER) { if (!arch_perf_have_user_stack_dump()) return -ENOSYS; /* * We have __u32 type for the size, but so far * we can only use __u16 as maximum due to the * __u16 sample size limit. */ if (attr->sample_stack_user >= USHRT_MAX) return -EINVAL; else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64))) return -EINVAL; } if (!attr->sample_max_stack) attr->sample_max_stack = sysctl_perf_event_max_stack; if (attr->sample_type & PERF_SAMPLE_REGS_INTR) ret = perf_reg_validate(attr->sample_regs_intr); #ifndef CONFIG_CGROUP_PERF if (attr->sample_type & PERF_SAMPLE_CGROUP) return -EINVAL; #endif out: return ret; err_size: put_user(sizeof(*attr), &uattr->size); ret = -E2BIG; goto out; } static void mutex_lock_double(struct mutex *a, struct mutex *b) { if (b < a) swap(a, b); mutex_lock(a); mutex_lock_nested(b, SINGLE_DEPTH_NESTING); } static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { struct perf_buffer *rb = NULL; int ret = -EINVAL; if (!output_event) { mutex_lock(&event->mmap_mutex); goto set; } /* don't allow circular references */ if (event == output_event) goto out; /* * Don't allow cross-cpu buffers */ if (output_event->cpu != event->cpu) goto out; /* * If its not a per-cpu rb, it must be the same task. */ if (output_event->cpu == -1 && output_event->hw.target != event->hw.target) goto out; /* * Mixing clocks in the same buffer is trouble you don't need. */ if (output_event->clock != event->clock) goto out; /* * Either writing ring buffer from beginning or from end. * Mixing is not allowed. */ if (is_write_backward(output_event) != is_write_backward(event)) goto out; /* * If both events generate aux data, they must be on the same PMU */ if (has_aux(event) && has_aux(output_event) && event->pmu != output_event->pmu) goto out; /* * Hold both mmap_mutex to serialize against perf_mmap_close(). Since * output_event is already on rb->event_list, and the list iteration * restarts after every removal, it is guaranteed this new event is * observed *OR* if output_event is already removed, it's guaranteed we * observe !rb->mmap_count. */ mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex); set: /* Can't redirect output if we've got an active mmap() */ if (atomic_read(&event->mmap_count)) goto unlock; if (output_event) { /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); if (!rb) goto unlock; /* did we race against perf_mmap_close() */ if (!atomic_read(&rb->mmap_count)) { ring_buffer_put(rb); goto unlock; } } ring_buffer_attach(event, rb); ret = 0; unlock: mutex_unlock(&event->mmap_mutex); if (output_event) mutex_unlock(&output_event->mmap_mutex); out: return ret; } static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) { bool nmi_safe = false; switch (clk_id) { case CLOCK_MONOTONIC: event->clock = &ktime_get_mono_fast_ns; nmi_safe = true; break; case CLOCK_MONOTONIC_RAW: event->clock = &ktime_get_raw_fast_ns; nmi_safe = true; break; case CLOCK_REALTIME: event->clock = &ktime_get_real_ns; break; case CLOCK_BOOTTIME: event->clock = &ktime_get_boottime_ns; break; case CLOCK_TAI: event->clock = &ktime_get_clocktai_ns; break; default: return -EINVAL; } if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)) return -EINVAL; return 0; } /* * Variation on perf_event_ctx_lock_nested(), except we take two context * mutexes. */ static struct perf_event_context * __perf_event_ctx_lock_double(struct perf_event *group_leader, struct perf_event_context *ctx) { struct perf_event_context *gctx; again: rcu_read_lock(); gctx = READ_ONCE(group_leader->ctx); if (!refcount_inc_not_zero(&gctx->refcount)) { rcu_read_unlock(); goto again; } rcu_read_unlock(); mutex_lock_double(&gctx->mutex, &ctx->mutex); if (group_leader->ctx != gctx) { mutex_unlock(&ctx->mutex); mutex_unlock(&gctx->mutex); put_ctx(gctx); goto again; } return gctx; } /** * sys_perf_event_open - open a performance event, associate it to a task/cpu * * @attr_uptr: event_id type attributes for monitoring/sampling * @pid: target pid * @cpu: target cpu * @group_fd: group leader event fd */ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_event *group_leader = NULL, *output_event = NULL; struct perf_event *event, *sibling; struct perf_event_attr attr; struct perf_event_context *ctx, *gctx; struct file *event_file = NULL; struct fd group = {NULL, 0}; struct task_struct *task = NULL; struct pmu *pmu; int event_fd; int move_group = 0; int err; int f_flags = O_RDWR; int cgroup_fd = -1; /* for future expandability... */ if (flags & ~PERF_FLAG_ALL) return -EINVAL; err = perf_copy_attr(attr_uptr, &attr); if (err) return err; /* Do we allow access to perf_event_open(2) ? */ err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); if (err) return err; if (!attr.exclude_kernel) { err = perf_allow_kernel(&attr); if (err) return err; } if (attr.namespaces) { if (!perfmon_capable()) return -EACCES; } if (attr.freq) { if (attr.sample_freq > sysctl_perf_event_sample_rate) return -EINVAL; } else { if (attr.sample_period & (1ULL << 63)) return -EINVAL; } /* Only privileged users can get physical addresses */ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) { err = perf_allow_kernel(&attr); if (err) return err; } /* REGS_INTR can leak data, lockdown must prevent this */ if (attr.sample_type & PERF_SAMPLE_REGS_INTR) { err = security_locked_down(LOCKDOWN_PERF); if (err) return err; } /* * In cgroup mode, the pid argument is used to pass the fd * opened to the cgroup directory in cgroupfs. The cpu argument * designates the cpu on which to monitor threads from that * cgroup. */ if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) return -EINVAL; if (flags & PERF_FLAG_FD_CLOEXEC) f_flags |= O_CLOEXEC; event_fd = get_unused_fd_flags(f_flags); if (event_fd < 0) return event_fd; if (group_fd != -1) { err = perf_fget_light(group_fd, &group); if (err) goto err_fd; group_leader = group.file->private_data; if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) group_leader = NULL; } if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { task = find_lively_task_by_vpid(pid); if (IS_ERR(task)) { err = PTR_ERR(task); goto err_group_fd; } } if (task && group_leader && group_leader->attr.inherit != attr.inherit) { err = -EINVAL; goto err_task; } if (flags & PERF_FLAG_PID_CGROUP) cgroup_fd = pid; event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL, NULL, cgroup_fd); if (IS_ERR(event)) { err = PTR_ERR(event); goto err_task; } if (is_sampling_event(event)) { if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { err = -EOPNOTSUPP; goto err_alloc; } } /* * Special case software events and allow them to be part of * any hardware group. */ pmu = event->pmu; if (attr.use_clockid) { err = perf_event_set_clock(event, attr.clockid); if (err) goto err_alloc; } if (pmu->task_ctx_nr == perf_sw_context) event->event_caps |= PERF_EV_CAP_SOFTWARE; if (group_leader) { if (is_software_event(event) && !in_software_context(group_leader)) { /* * If the event is a sw event, but the group_leader * is on hw context. * * Allow the addition of software events to hw * groups, this is safe because software events * never fail to schedule. */ pmu = group_leader->ctx->pmu; } else if (!is_software_event(event) && is_software_event(group_leader) && (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { /* * In case the group is a pure software group, and we * try to add a hardware event, move the whole group to * the hardware context. */ move_group = 1; } } /* * Get the target context (task or percpu): */ ctx = find_get_context(pmu, task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_alloc; } /* * Look up the group leader (we will attach this event to it): */ if (group_leader) { err = -EINVAL; /* * Do not allow a recursive hierarchy (this new sibling * becoming part of another group-sibling): */ if (group_leader->group_leader != group_leader) goto err_context; /* All events in a group should have the same clock */ if (group_leader->clock != event->clock) goto err_context; /* * Make sure we're both events for the same CPU; * grouping events for different CPUs is broken; since * you can never concurrently schedule them anyhow. */ if (group_leader->cpu != event->cpu) goto err_context; /* * Make sure we're both on the same task, or both * per-CPU events. */ if (group_leader->ctx->task != ctx->task) goto err_context; /* * Do not allow to attach to a group in a different task * or CPU context. If we're moving SW events, we'll fix * this up later, so allow that. * * Racy, not holding group_leader->ctx->mutex, see comment with * perf_event_ctx_lock(). */ if (!move_group && group_leader->ctx != ctx) goto err_context; /* * Only a group leader can be exclusive or pinned */ if (attr.exclusive || attr.pinned) goto err_context; } if (output_event) { err = perf_event_set_output(event, output_event); if (err) goto err_context; } event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags); if (IS_ERR(event_file)) { err = PTR_ERR(event_file); event_file = NULL; goto err_context; } if (task) { err = down_read_interruptible(&task->signal->exec_update_lock); if (err) goto err_file; /* * Preserve ptrace permission check for backwards compatibility. * * We must hold exec_update_lock across this and any potential * perf_install_in_context() call for this new event to * serialize against exec() altering our credentials (and the * perf_event_exit_task() that could imply). */ err = -EACCES; if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) goto err_cred; } if (move_group) { gctx = __perf_event_ctx_lock_double(group_leader, ctx); if (gctx->task == TASK_TOMBSTONE) { err = -ESRCH; goto err_locked; } /* * Check if we raced against another sys_perf_event_open() call * moving the software group underneath us. */ if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { /* * If someone moved the group out from under us, check * if this new event wound up on the same ctx, if so * its the regular !move_group case, otherwise fail. */ if (gctx != ctx) { err = -EINVAL; goto err_locked; } else { perf_event_ctx_unlock(group_leader, gctx); move_group = 0; goto not_move_group; } } /* * Failure to create exclusive events returns -EBUSY. */ err = -EBUSY; if (!exclusive_event_installable(group_leader, ctx)) goto err_locked; for_each_sibling_event(sibling, group_leader) { if (!exclusive_event_installable(sibling, ctx)) goto err_locked; } } else { mutex_lock(&ctx->mutex); /* * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx, * see the group_leader && !move_group test earlier. */ if (group_leader && group_leader->ctx != ctx) { err = -EINVAL; goto err_locked; } } not_move_group: if (ctx->task == TASK_TOMBSTONE) { err = -ESRCH; goto err_locked; } if (!perf_event_validate_size(event)) { err = -E2BIG; goto err_locked; } if (!task) { /* * Check if the @cpu we're creating an event for is online. * * We use the perf_cpu_context::ctx::mutex to serialize against * the hotplug notifiers. See perf_event_{init,exit}_cpu(). */ struct perf_cpu_context *cpuctx = container_of(ctx, struct perf_cpu_context, ctx); if (!cpuctx->online) { err = -ENODEV; goto err_locked; } } if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) { err = -EINVAL; goto err_locked; } /* * Must be under the same ctx::mutex as perf_install_in_context(), * because we need to serialize with concurrent event creation. */ if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; goto err_locked; } WARN_ON_ONCE(ctx->parent_ctx); /* * This is the point on no return; we cannot fail hereafter. This is * where we start modifying current state. */ if (move_group) { /* * See perf_event_ctx_lock() for comments on the details * of swizzling perf_event::ctx. */ perf_remove_from_context(group_leader, 0); put_ctx(gctx); for_each_sibling_event(sibling, group_leader) { perf_remove_from_context(sibling, 0); put_ctx(gctx); } /* * Wait for everybody to stop referencing the events through * the old lists, before installing it on new lists. */ synchronize_rcu(); /* * Install the group siblings before the group leader. * * Because a group leader will try and install the entire group * (through the sibling list, which is still in-tact), we can * end up with siblings installed in the wrong context. * * By installing siblings first we NO-OP because they're not * reachable through the group lists. */ for_each_sibling_event(sibling, group_leader) { perf_event__state_init(sibling); perf_install_in_context(ctx, sibling, sibling->cpu); get_ctx(ctx); } /* * Removing from the context ends up with disabled * event. What we want here is event in the initial * startup state, ready to be add into new context. */ perf_event__state_init(group_leader); perf_install_in_context(ctx, group_leader, group_leader->cpu); get_ctx(ctx); } /* * Precalculate sample_data sizes; do while holding ctx::mutex such * that we're serialized against further additions and before * perf_install_in_context() which is the point the event is active and * can use these values. */ perf_event__header_size(event); perf_event__id_header_size(event); event->owner = current; perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); if (move_group) perf_event_ctx_unlock(group_leader, gctx); mutex_unlock(&ctx->mutex); if (task) { up_read(&task->signal->exec_update_lock); put_task_struct(task); } mutex_lock(&current->perf_event_mutex); list_add_tail(&event->owner_entry, &current->perf_event_list); mutex_unlock(&current->perf_event_mutex); /* * Drop the reference on the group_event after placing the * new event on the sibling_list. This ensures destruction * of the group leader will find the pointer to itself in * perf_group_detach(). */ fdput(group); fd_install(event_fd, event_file); return event_fd; err_locked: if (move_group) perf_event_ctx_unlock(group_leader, gctx); mutex_unlock(&ctx->mutex); err_cred: if (task) up_read(&task->signal->exec_update_lock); err_file: fput(event_file); err_context: perf_unpin_context(ctx); put_ctx(ctx); err_alloc: /* * If event_file is set, the fput() above will have called ->release() * and that will take care of freeing the event. */ if (!event_file) free_event(event); err_task: if (task) put_task_struct(task); err_group_fd: fdput(group); err_fd: put_unused_fd(event_fd); return err; } /** * perf_event_create_kernel_counter * * @attr: attributes of the counter to create * @cpu: cpu in which the counter is bound * @task: task to profile (NULL for percpu) */ struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, perf_overflow_handler_t overflow_handler, void *context) { struct perf_event_context *ctx; struct perf_event *event; int err; /* * Grouping is not supported for kernel events, neither is 'AUX', * make sure the caller's intentions are adjusted. */ if (attr->aux_output) return ERR_PTR(-EINVAL); event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler, context, -1); if (IS_ERR(event)) { err = PTR_ERR(event); goto err; } /* Mark owner so we could distinguish it from user events. */ event->owner = TASK_TOMBSTONE; /* * Get the target context (task or percpu): */ ctx = find_get_context(event->pmu, task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_free; } WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); if (ctx->task == TASK_TOMBSTONE) { err = -ESRCH; goto err_unlock; } if (!task) { /* * Check if the @cpu we're creating an event for is online. * * We use the perf_cpu_context::ctx::mutex to serialize against * the hotplug notifiers. See perf_event_{init,exit}_cpu(). */ struct perf_cpu_context *cpuctx = container_of(ctx, struct perf_cpu_context, ctx); if (!cpuctx->online) { err = -ENODEV; goto err_unlock; } } if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; goto err_unlock; } perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); return event; err_unlock: mutex_unlock(&ctx->mutex); perf_unpin_context(ctx); put_ctx(ctx); err_free: free_event(event); err: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) { struct perf_event_context *src_ctx; struct perf_event_context *dst_ctx; struct perf_event *event, *tmp; LIST_HEAD(events); src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; /* * See perf_event_ctx_lock() for comments on the details * of swizzling perf_event::ctx. */ mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); list_for_each_entry_safe(event, tmp, &src_ctx->event_list, event_entry) { perf_remove_from_context(event, 0); unaccount_event_cpu(event, src_cpu); put_ctx(src_ctx); list_add(&event->migrate_entry, &events); } /* * Wait for the events to quiesce before re-instating them. */ synchronize_rcu(); /* * Re-instate events in 2 passes. * * Skip over group leaders and only install siblings on this first * pass, siblings will not get enabled without a leader, however a * leader will enable its siblings, even if those are still on the old * context. */ list_for_each_entry_safe(event, tmp, &events, migrate_entry) { if (event->group_leader == event) continue; list_del(&event->migrate_entry); if (event->state >= PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_INACTIVE; account_event_cpu(event, dst_cpu); perf_install_in_context(dst_ctx, event, dst_cpu); get_ctx(dst_ctx); } /* * Once all the siblings are setup properly, install the group leaders * to make it go. */ list_for_each_entry_safe(event, tmp, &events, migrate_entry) { list_del(&event->migrate_entry); if (event->state >= PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_INACTIVE; account_event_cpu(event, dst_cpu); perf_install_in_context(dst_ctx, event, dst_cpu); get_ctx(dst_ctx); } mutex_unlock(&dst_ctx->mutex); mutex_unlock(&src_ctx->mutex); } EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); static void sync_child_event(struct perf_event *child_event, struct task_struct *child) { struct perf_event *parent_event = child_event->parent; u64 child_val; if (child_event->attr.inherit_stat) perf_event_read_event(child_event, child); child_val = perf_event_count(child_event); /* * Add back the child's count to the parent's count: */ atomic64_add(child_val, &parent_event->child_count); atomic64_add(child_event->total_time_enabled, &parent_event->child_total_time_enabled); atomic64_add(child_event->total_time_running, &parent_event->child_total_time_running); } static void perf_event_exit_event(struct perf_event *child_event, struct perf_event_context *child_ctx, struct task_struct *child) { struct perf_event *parent_event = child_event->parent; /* * Do not destroy the 'original' grouping; because of the context * switch optimization the original events could've ended up in a * random child task. * * If we were to destroy the original group, all group related * operations would cease to function properly after this random * child dies. * * Do destroy all inherited groups, we don't care about those * and being thorough is better. */ raw_spin_lock_irq(&child_ctx->lock); WARN_ON_ONCE(child_ctx->is_active); if (parent_event) perf_group_detach(child_event); list_del_event(child_event, child_ctx); perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */ raw_spin_unlock_irq(&child_ctx->lock); /* * Parent events are governed by their filedesc, retain them. */ if (!parent_event) { perf_event_wakeup(child_event); return; } /* * Child events can be cleaned up. */ sync_child_event(child_event, child); /* * Remove this event from the parent's list */ WARN_ON_ONCE(parent_event->ctx->parent_ctx); mutex_lock(&parent_event->child_mutex); list_del_init(&child_event->child_list); mutex_unlock(&parent_event->child_mutex); /* * Kick perf_poll() for is_event_hup(). */ perf_event_wakeup(parent_event); free_event(child_event); put_event(parent_event); } static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { struct perf_event_context *child_ctx, *clone_ctx = NULL; struct perf_event *child_event, *next; WARN_ON_ONCE(child != current); child_ctx = perf_pin_task_context(child, ctxn); if (!child_ctx) return; /* * In order to reduce the amount of tricky in ctx tear-down, we hold * ctx::mutex over the entire thing. This serializes against almost * everything that wants to access the ctx. * * The exception is sys_perf_event_open() / * perf_event_create_kernel_count() which does find_get_context() * without ctx::mutex (it cannot because of the move_group double mutex * lock thing). See the comments in perf_install_in_context(). */ mutex_lock(&child_ctx->mutex); /* * In a single ctx::lock section, de-schedule the events and detach the * context from the task such that we cannot ever get it scheduled back * in. */ raw_spin_lock_irq(&child_ctx->lock); task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation * and mark the context dead. */ RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL); put_ctx(child_ctx); /* cannot be last */ WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); put_task_struct(current); /* cannot be last */ clone_ctx = unclone_ctx(child_ctx); raw_spin_unlock_irq(&child_ctx->lock); if (clone_ctx) put_ctx(clone_ctx); /* * Report the task dead after unscheduling the events so that we * won't get any samples after PERF_RECORD_EXIT. We can however still * get a few PERF_RECORD_READ events. */ perf_event_task(child, child_ctx, 0); list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) perf_event_exit_event(child_event, child_ctx, child); mutex_unlock(&child_ctx->mutex); put_ctx(child_ctx); } /* * When a child task exits, feed back event values to parent events. * * Can be called with exec_update_lock held when called from * setup_new_exec(). */ void perf_event_exit_task(struct task_struct *child) { struct perf_event *event, *tmp; int ctxn; mutex_lock(&child->perf_event_mutex); list_for_each_entry_safe(event, tmp, &child->perf_event_list, owner_entry) { list_del_init(&event->owner_entry); /* * Ensure the list deletion is visible before we clear * the owner, closes a race against perf_release() where * we need to serialize on the owner->perf_event_mutex. */ smp_store_release(&event->owner, NULL); } mutex_unlock(&child->perf_event_mutex); for_each_task_context_nr(ctxn) perf_event_exit_task_context(child, ctxn); /* * The perf_event_exit_task_context calls perf_event_task * with child's task_ctx, which generates EXIT events for * child contexts and sets child->perf_event_ctxp[] to NULL. * At this point we need to send EXIT events to cpu contexts. */ perf_event_task(child, NULL, 0); } static void perf_free_event(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *parent = event->parent; if (WARN_ON_ONCE(!parent)) return; mutex_lock(&parent->child_mutex); list_del_init(&event->child_list); mutex_unlock(&parent->child_mutex); put_event(parent); raw_spin_lock_irq(&ctx->lock); perf_group_detach(event); list_del_event(event, ctx); raw_spin_unlock_irq(&ctx->lock); free_event(event); } /* * Free a context as created by inheritance by perf_event_init_task() below, * used by fork() in case of fail. * * Even though the task has never lived, the context and events have been * exposed through the child_list, so we must take care tearing it all down. */ void perf_event_free_task(struct task_struct *task) { struct perf_event_context *ctx; struct perf_event *event, *tmp; int ctxn; for_each_task_context_nr(ctxn) { ctx = task->perf_event_ctxp[ctxn]; if (!ctx) continue; mutex_lock(&ctx->mutex); raw_spin_lock_irq(&ctx->lock); /* * Destroy the task <-> ctx relation and mark the context dead. * * This is important because even though the task hasn't been * exposed yet the context has been (through child_list). */ RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL); WRITE_ONCE(ctx->task, TASK_TOMBSTONE); put_task_struct(task); /* cannot be last */ raw_spin_unlock_irq(&ctx->lock); list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) perf_free_event(event, ctx); mutex_unlock(&ctx->mutex); /* * perf_event_release_kernel() could've stolen some of our * child events and still have them on its free_list. In that * case we must wait for these events to have been freed (in * particular all their references to this task must've been * dropped). * * Without this copy_process() will unconditionally free this * task (irrespective of its reference count) and * _free_event()'s put_task_struct(event->hw.target) will be a * use-after-free. * * Wait for all events to drop their context reference. */ wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); put_ctx(ctx); /* must be last */ } } void perf_event_delayed_put(struct task_struct *task) { int ctxn; for_each_task_context_nr(ctxn) WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); } struct file *perf_event_get(unsigned int fd) { struct file *file = fget(fd); if (!file) return ERR_PTR(-EBADF); if (file->f_op != &perf_fops) { fput(file); return ERR_PTR(-EBADF); } return file; } const struct perf_event *perf_get_event(struct file *file) { if (file->f_op != &perf_fops) return ERR_PTR(-EINVAL); return file->private_data; } const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { if (!event) return ERR_PTR(-EINVAL); return &event->attr; } /* * Inherit an event from parent task to child task. * * Returns: * - valid pointer on success * - NULL for orphaned events * - IS_ERR() on error */ static struct perf_event * inherit_event(struct perf_event *parent_event, struct task_struct *parent, struct perf_event_context *parent_ctx, struct task_struct *child, struct perf_event *group_leader, struct perf_event_context *child_ctx) { enum perf_event_state parent_state = parent_event->state; struct perf_event *child_event; unsigned long flags; /* * Instead of creating recursive hierarchies of events, * we link inherited events back to the original parent, * which has a filp for sure, which we use as the reference * count: */ if (parent_event->parent) parent_event = parent_event->parent; child_event = perf_event_alloc(&parent_event->attr, parent_event->cpu, child, group_leader, parent_event, NULL, NULL, -1); if (IS_ERR(child_event)) return child_event; if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) && !child_ctx->task_ctx_data) { struct pmu *pmu = child_event->pmu; child_ctx->task_ctx_data = alloc_task_ctx_data(pmu); if (!child_ctx->task_ctx_data) { free_event(child_event); return ERR_PTR(-ENOMEM); } } /* * is_orphaned_event() and list_add_tail(&parent_event->child_list) * must be under the same lock in order to serialize against * perf_event_release_kernel(), such that either we must observe * is_orphaned_event() or they will observe us on the child_list. */ mutex_lock(&parent_event->child_mutex); if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { mutex_unlock(&parent_event->child_mutex); /* task_ctx_data is freed with child_ctx */ free_event(child_event); return NULL; } get_ctx(child_ctx); /* * Make the child state follow the state of the parent event, * not its attr.disabled bit. We hold the parent's mutex, * so we won't race with perf_event_{en, dis}able_family. */ if (parent_state >= PERF_EVENT_STATE_INACTIVE) child_event->state = PERF_EVENT_STATE_INACTIVE; else child_event->state = PERF_EVENT_STATE_OFF; if (parent_event->attr.freq) { u64 sample_period = parent_event->hw.sample_period; struct hw_perf_event *hwc = &child_event->hw; hwc->sample_period = sample_period; hwc->last_period = sample_period; local64_set(&hwc->period_left, sample_period); } child_event->ctx = child_ctx; child_event->overflow_handler = parent_event->overflow_handler; child_event->overflow_handler_context = parent_event->overflow_handler_context; /* * Precalculate sample_data sizes */ perf_event__header_size(child_event); perf_event__id_header_size(child_event); /* * Link it up in the child's context: */ raw_spin_lock_irqsave(&child_ctx->lock, flags); add_event_to_ctx(child_event, child_ctx); raw_spin_unlock_irqrestore(&child_ctx->lock, flags); /* * Link this into the parent event's child list */ list_add_tail(&child_event->child_list, &parent_event->child_list); mutex_unlock(&parent_event->child_mutex); return child_event; } /* * Inherits an event group. * * This will quietly suppress orphaned events; !inherit_event() is not an error. * This matches with perf_event_release_kernel() removing all child events. * * Returns: * - 0 on success * - <0 on error */ static int inherit_group(struct perf_event *parent_event, struct task_struct *parent, struct perf_event_context *parent_ctx, struct task_struct *child, struct perf_event_context *child_ctx) { struct perf_event *leader; struct perf_event *sub; struct perf_event *child_ctr; leader = inherit_event(parent_event, parent, parent_ctx, child, NULL, child_ctx); if (IS_ERR(leader)) return PTR_ERR(leader); /* * @leader can be NULL here because of is_orphaned_event(). In this * case inherit_event() will create individual events, similar to what * perf_group_detach() would do anyway. */ for_each_sibling_event(sub, parent_event) { child_ctr = inherit_event(sub, parent, parent_ctx, child, leader, child_ctx); if (IS_ERR(child_ctr)) return PTR_ERR(child_ctr); if (sub->aux_event == parent_event && child_ctr && !perf_get_aux_event(child_ctr, leader)) return -EINVAL; } if (leader) leader->group_generation = parent_event->group_generation; return 0; } /* * Creates the child task context and tries to inherit the event-group. * * Clears @inherited_all on !attr.inherited or error. Note that we'll leave * inherited_all set when we 'fail' to inherit an orphaned event; this is * consistent with perf_event_release_kernel() removing all child events. * * Returns: * - 0 on success * - <0 on error */ static int inherit_task_group(struct perf_event *event, struct task_struct *parent, struct perf_event_context *parent_ctx, struct task_struct *child, int ctxn, int *inherited_all) { int ret; struct perf_event_context *child_ctx; if (!event->attr.inherit) { *inherited_all = 0; return 0; } child_ctx = child->perf_event_ctxp[ctxn]; if (!child_ctx) { /* * This is executed from the parent task context, so * inherit events that have been marked for cloning. * First allocate and initialize a context for the * child. */ child_ctx = alloc_perf_context(parent_ctx->pmu, child); if (!child_ctx) return -ENOMEM; child->perf_event_ctxp[ctxn] = child_ctx; } ret = inherit_group(event, parent, parent_ctx, child, child_ctx); if (ret) *inherited_all = 0; return ret; } /* * Initialize the perf_event context in task_struct */ static int perf_event_init_context(struct task_struct *child, int ctxn) { struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; struct perf_event *event; struct task_struct *parent = current; int inherited_all = 1; unsigned long flags; int ret = 0; if (likely(!parent->perf_event_ctxp[ctxn])) return 0; /* * If the parent's context is a clone, pin it so it won't get * swapped under us. */ parent_ctx = perf_pin_task_context(parent, ctxn); if (!parent_ctx) return 0; /* * No need to check if parent_ctx != NULL here; since we saw * it non-NULL earlier, the only reason for it to become NULL * is if we exit, and since we're currently in the middle of * a fork we can't be exiting at the same time. */ /* * Lock the parent list. No need to lock the child - not PID * hashed yet and not running, so nobody can access it. */ mutex_lock(&parent_ctx->mutex); /* * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ perf_event_groups_for_each(event, &parent_ctx->pinned_groups) { ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); if (ret) goto out_unlock; } /* * We can't hold ctx->lock when iterating the ->flexible_group list due * to allocations, but we need to prevent rotation because * rotate_ctx() will change the list from interrupt context. */ raw_spin_lock_irqsave(&parent_ctx->lock, flags); parent_ctx->rotate_disable = 1; raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); perf_event_groups_for_each(event, &parent_ctx->flexible_groups) { ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); if (ret) goto out_unlock; } raw_spin_lock_irqsave(&parent_ctx->lock, flags); parent_ctx->rotate_disable = 0; child_ctx = child->perf_event_ctxp[ctxn]; if (child_ctx && inherited_all) { /* * Mark the child context as a clone of the parent * context, or of whatever the parent is a clone of. * * Note that if the parent is a clone, the holding of * parent_ctx->lock avoids it from being uncloned. */ cloned_ctx = parent_ctx->parent_ctx; if (cloned_ctx) { child_ctx->parent_ctx = cloned_ctx; child_ctx->parent_gen = parent_ctx->parent_gen; } else { child_ctx->parent_ctx = parent_ctx; child_ctx->parent_gen = parent_ctx->generation; } get_ctx(child_ctx->parent_ctx); } raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); out_unlock: mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); put_ctx(parent_ctx); return ret; } /* * Initialize the perf_event context in task_struct */ int perf_event_init_task(struct task_struct *child) { int ctxn, ret; memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp)); mutex_init(&child->perf_event_mutex); INIT_LIST_HEAD(&child->perf_event_list); for_each_task_context_nr(ctxn) { ret = perf_event_init_context(child, ctxn); if (ret) { perf_event_free_task(child); return ret; } } return 0; } static void __init perf_event_init_all_cpus(void) { struct swevent_htable *swhash; int cpu; zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); #ifdef CONFIG_CGROUP_PERF INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu)); #endif INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); } } static void perf_swevent_init_cpu(unsigned int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) { struct swevent_hlist *hlist; hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); WARN_ON(!hlist); rcu_assign_pointer(swhash->swevent_hlist, hlist); } mutex_unlock(&swhash->hlist_mutex); } #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE static void __perf_event_exit_context(void *__info) { struct perf_event_context *ctx = __info; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event *event; raw_spin_lock(&ctx->lock); ctx_sched_out(ctx, cpuctx, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); raw_spin_unlock(&ctx->lock); } static void perf_event_exit_cpu_context(int cpu) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; struct pmu *pmu; mutex_lock(&pmus_lock); list_for_each_entry(pmu, &pmus, entry) { cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); ctx = &cpuctx->ctx; mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); cpuctx->online = 0; mutex_unlock(&ctx->mutex); } cpumask_clear_cpu(cpu, perf_online_mask); mutex_unlock(&pmus_lock); } #else static void perf_event_exit_cpu_context(int cpu) { } #endif int perf_event_init_cpu(unsigned int cpu) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; struct pmu *pmu; perf_swevent_init_cpu(cpu); mutex_lock(&pmus_lock); cpumask_set_cpu(cpu, perf_online_mask); list_for_each_entry(pmu, &pmus, entry) { cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); ctx = &cpuctx->ctx; mutex_lock(&ctx->mutex); cpuctx->online = 1; mutex_unlock(&ctx->mutex); } mutex_unlock(&pmus_lock); return 0; } int perf_event_exit_cpu(unsigned int cpu) { perf_event_exit_cpu_context(cpu); return 0; } static int perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) { int cpu; for_each_online_cpu(cpu) perf_event_exit_cpu(cpu); return NOTIFY_OK; } /* * Run the perf reboot notifier at the very last possible moment so that * the generic watchdog code runs as long as possible. */ static struct notifier_block perf_reboot_notifier = { .notifier_call = perf_reboot, .priority = INT_MIN, }; void __init perf_event_init(void) { int ret; idr_init(&pmu_idr); perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); perf_pmu_register(&perf_cpu_clock, NULL, -1); perf_pmu_register(&perf_task_clock, NULL, -1); perf_tp_register(); perf_event_init_cpu(smp_processor_id()); register_reboot_notifier(&perf_reboot_notifier); ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); /* * Build time assertion that we keep the data_head at the intended * location. IOW, validation we got the __reserved[] size right. */ BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head)) != 1024); } ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_attr *pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); if (pmu_attr->event_str) return sprintf(page, "%s\n", pmu_attr->event_str); return 0; } EXPORT_SYMBOL_GPL(perf_event_sysfs_show); static int __init perf_event_sysfs_init(void) { struct pmu *pmu; int ret; mutex_lock(&pmus_lock); ret = bus_register(&pmu_bus); if (ret) goto unlock; list_for_each_entry(pmu, &pmus, entry) { if (!pmu->name || pmu->type < 0) continue; ret = pmu_dev_alloc(pmu); WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret); } pmu_bus_running = 1; ret = 0; unlock: mutex_unlock(&pmus_lock); return ret; } device_initcall(perf_event_sysfs_init); #ifdef CONFIG_CGROUP_PERF static struct cgroup_subsys_state * perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct perf_cgroup *jc; jc = kzalloc(sizeof(*jc), GFP_KERNEL); if (!jc) return ERR_PTR(-ENOMEM); jc->info = alloc_percpu(struct perf_cgroup_info); if (!jc->info) { kfree(jc); return ERR_PTR(-ENOMEM); } return &jc->css; } static void perf_cgroup_css_free(struct cgroup_subsys_state *css) { struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css); free_percpu(jc->info); kfree(jc); } static int perf_cgroup_css_online(struct cgroup_subsys_state *css) { perf_event_cgroup(css->cgroup); return 0; } static int __perf_cgroup_move(void *info) { struct task_struct *task = info; rcu_read_lock(); perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); rcu_read_unlock(); return 0; } static void perf_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; cgroup_taskset_for_each(task, css, tset) task_function_call(task, __perf_cgroup_move, task); } struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, .css_online = perf_cgroup_css_online, .attach = perf_cgroup_attach, /* * Implicitly enable on dfl hierarchy so that perf events can * always be filtered by cgroup2 path as long as perf_event * controller is not mounted on a legacy hierarchy. */ .implicit_on_dfl = true, .threaded = true, }; #endif /* CONFIG_CGROUP_PERF */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 /* SPDX-License-Identifier: GPL-2.0 */ /* thread_info.h: common low-level thread information accessors * * Copyright (C) 2002 David Howells (dhowells@redhat.com) * - Incorporating suggestions made by Linus Torvalds */ #ifndef _LINUX_THREAD_INFO_H #define _LINUX_THREAD_INFO_H #include <linux/types.h> #include <linux/bug.h> #include <linux/restart_block.h> #include <linux/errno.h> #ifdef CONFIG_THREAD_INFO_IN_TASK /* * For CONFIG_THREAD_INFO_IN_TASK kernels we need <asm/current.h> for the * definition of current, but for !CONFIG_THREAD_INFO_IN_TASK kernels, * including <asm/current.h> can cause a circular dependency on some platforms. */ #include <asm/current.h> #define current_thread_info() ((struct thread_info *)current) #endif #include <linux/bitops.h> /* * For per-arch arch_within_stack_frames() implementations, defined in * asm/thread_info.h. */ enum { BAD_STACK = -1, NOT_STACK = 0, GOOD_FRAME, GOOD_STACK, }; #include <asm/thread_info.h> #ifdef __KERNEL__ #ifndef arch_set_restart_data #define arch_set_restart_data(restart) do { } while (0) #endif static inline long set_restart_fn(struct restart_block *restart, long (*fn)(struct restart_block *)) { restart->fn = fn; arch_set_restart_data(restart); return -ERESTART_RESTARTBLOCK; } #ifndef THREAD_ALIGN #define THREAD_ALIGN THREAD_SIZE #endif #define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) /* * flag set/clear/test wrappers * - pass TIF_xxxx constants to these functions */ static inline void set_ti_thread_flag(struct thread_info *ti, int flag) { set_bit(flag, (unsigned long *)&ti->flags); } static inline void clear_ti_thread_flag(struct thread_info *ti, int flag) { clear_bit(flag, (unsigned long *)&ti->flags); } static inline void update_ti_thread_flag(struct thread_info *ti, int flag, bool value) { if (value) set_ti_thread_flag(ti, flag); else clear_ti_thread_flag(ti, flag); } static inline int test_and_set_ti_thread_flag(struct thread_info *ti, int flag) { return test_and_set_bit(flag, (unsigned long *)&ti->flags); } static inline int test_and_clear_ti_thread_flag(struct thread_info *ti, int flag) { return test_and_clear_bit(flag, (unsigned long *)&ti->flags); } static inline int test_ti_thread_flag(struct thread_info *ti, int flag) { return test_bit(flag, (unsigned long *)&ti->flags); } #define set_thread_flag(flag) \ set_ti_thread_flag(current_thread_info(), flag) #define clear_thread_flag(flag) \ clear_ti_thread_flag(current_thread_info(), flag) #define update_thread_flag(flag, value) \ update_ti_thread_flag(current_thread_info(), flag, value) #define test_and_set_thread_flag(flag) \ test_and_set_ti_thread_flag(current_thread_info(), flag) #define test_and_clear_thread_flag(flag) \ test_and_clear_ti_thread_flag(current_thread_info(), flag) #define test_thread_flag(flag) \ test_ti_thread_flag(current_thread_info(), flag) #define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES static inline int arch_within_stack_frames(const void * const stack, const void * const stackend, const void *obj, unsigned long len) { return 0; } #endif #ifdef CONFIG_HARDENED_USERCOPY extern void __check_object_size(const void *ptr, unsigned long n, bool to_user); static __always_inline void check_object_size(const void *ptr, unsigned long n, bool to_user) { if (!__builtin_constant_p(n)) __check_object_size(ptr, n, to_user); } #else static inline void check_object_size(const void *ptr, unsigned long n, bool to_user) { } #endif /* CONFIG_HARDENED_USERCOPY */ extern void __compiletime_error("copy source size is too small") __bad_copy_from(void); extern void __compiletime_error("copy destination size is too small") __bad_copy_to(void); static inline void copy_overflow(int size, unsigned long count) { WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); } static __always_inline __must_check bool check_copy_size(const void *addr, size_t bytes, bool is_source) { int sz = __compiletime_object_size(addr); if (unlikely(sz >= 0 && sz < bytes)) { if (!__builtin_constant_p(bytes)) copy_overflow(sz, bytes); else if (is_source) __bad_copy_from(); else __bad_copy_to(); return false; } if (WARN_ON_ONCE(bytes > INT_MAX)) return false; check_object_size(addr, bytes, is_source); return true; } #ifndef arch_setup_new_exec static inline void arch_setup_new_exec(void) { } #endif #endif /* __KERNEL__ */ #endif /* _LINUX_THREAD_INFO_H */
9 2 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_PREEMPT_H #define __ASM_PREEMPT_H #include <asm/rmwcc.h> #include <asm/percpu.h> #include <linux/thread_info.h> DECLARE_PER_CPU(int, __preempt_count); /* We use the MSB mostly because its available */ #define PREEMPT_NEED_RESCHED 0x80000000 /* * We use the PREEMPT_NEED_RESCHED bit as an inverted NEED_RESCHED such * that a decrement hitting 0 means we can and should reschedule. */ #define PREEMPT_ENABLED (0 + PREEMPT_NEED_RESCHED) /* * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users * that think a non-zero value indicates we cannot preempt. */ static __always_inline int preempt_count(void) { return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED; } static __always_inline void preempt_count_set(int pc) { int old, new; do { old = raw_cpu_read_4(__preempt_count); new = (old & PREEMPT_NEED_RESCHED) | (pc & ~PREEMPT_NEED_RESCHED); } while (raw_cpu_cmpxchg_4(__preempt_count, old, new) != old); } /* * must be macros to avoid header recursion hell */ #define init_task_preempt_count(p) do { } while (0) #define init_idle_preempt_count(p, cpu) do { \ per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \ } while (0) /* * We fold the NEED_RESCHED bit into the preempt count such that * preempt_enable() can decrement and test for needing to reschedule with a * single instruction. * * We invert the actual bit, so that when the decrement hits 0 we know we both * need to resched (the bit is cleared) and can resched (no preempt count). */ static __always_inline void set_preempt_need_resched(void) { raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED); } static __always_inline void clear_preempt_need_resched(void) { raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED); } static __always_inline bool test_preempt_need_resched(void) { return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED); } /* * The various preempt_count add/sub methods */ static __always_inline void __preempt_count_add(int val) { raw_cpu_add_4(__preempt_count, val); } static __always_inline void __preempt_count_sub(int val) { raw_cpu_add_4(__preempt_count, -val); } /* * Because we keep PREEMPT_NEED_RESCHED set when we do _not_ need to reschedule * a decrement which hits zero means we have no preempt_count and should * reschedule. */ static __always_inline bool __preempt_count_dec_and_test(void) { return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var])); } /* * Returns true when we need to resched and can (barring IRQ state). */ static __always_inline bool should_resched(int preempt_offset) { return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); } #ifdef CONFIG_PREEMPTION extern asmlinkage void preempt_schedule_thunk(void); # define __preempt_schedule() \ asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT) extern asmlinkage void preempt_schedule(void); extern asmlinkage void preempt_schedule_notrace_thunk(void); # define __preempt_schedule_notrace() \ asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT) extern asmlinkage void preempt_schedule_notrace(void); #endif #endif /* __ASM_PREEMPT_H */
prctl$PR_SET_MM_MAP(0x1c, 0xe, 0x0, 0x0)
stat(&(0x7f0000001cc0)='.\x00', &(0x7f0000001c00)={0x0, 0x0, 0x0, 0x0, <r0=>0x0})
setresuid(0xffffffffffffffff, r0, 0x0)
setfsuid(0x0)
stat(&(0x7f0000001cc0)='.\x00', &(0x7f0000001c00)={0x0, 0x0, 0x0, 0x0, <r0=>0x0})
setresuid(0xffffffffffffffff, r0, 0x0)
setfsuid(0x0)
r1 = syz_open_dev$tty20(0xc, 0x4, 0x0)
ioctl$VT_WAITACTIVE(r1, 0x5606)
perf_event_open(&(0x7f0000001d80)={0x2, 0x80, 0x93, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_bp={0x0}}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
setfsuid(0x0)
setresuid(0xee01, 0xffffffffffffffff, 0xee00)
setfsuid(0x0)
setresuid(0xee00, 0xee00, 0xee01)
setfsuid(0xee01)
stat(&(0x7f00000001c0)='.\x00', &(0x7f0000000000)={0x0, 0x0, 0x0, 0x0, <r0=>0x0})
setfsuid(r0)
stat(&(0x7f00000001c0)='.\x00', &(0x7f0000000000)={0x0, 0x0, 0x0, 0x0, <r1=>0x0})
setfsuid(r1)
stat(&(0x7f0000001cc0)='.\x00', &(0x7f0000001c00)={0x0, 0x0, 0x0, 0x0, <r0=>0x0})
setresuid(0x0, r0, 0x0)
setfsuid(0xee01)
setfsuid(0xffffffffffffffff)
perf_event_open(&(0x7f0000001d80)={0x2, 0x80, 0x94, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_config_ext}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
setfsuid(0x0)
____fput---of 1
__alloc_file---of 8
__fput---of 34
__fput_sync---of 4
alloc_empty_file---of 7
alloc_empty_file.cold---of 1
alloc_empty_file_noaccount---of 3
alloc_file---of 12
alloc_file_clone---of 3
alloc_file_pseudo---of 6
delayed_fput---of 2
file_free_rcu---of 4
flush_delayed_fput---of 2
fput100%of 1
fput_many17%of 6
get_max_files---of 1
proc_nr_files---of 1
alloc_uid78%of 9
find_user---of 1
free_uid45%of 9
uid_hash_find.isra.017%of 12
__generic_block_fiemap---of 27
__ia32_compat_sys_ioctl---of 16
__ia32_sys_ioctl---of 11
__x64_sys_ioctl82%of 11
compat_ioctl_preallocate---of 7
compat_ptr_ioctl---of 3
do_vfs_ioctl5%of 63
do_vfs_ioctl.cold---of 1
fiemap_fill_next_extent---of 13
fiemap_prep---of 10
generic_block_fiemap---of 1
ioctl_file_clone---of 9
ioctl_preallocate---of 7
vfs_ioctl---of 5
get_user_session_keyring_rcu---of 4
install_process_keyring_to_cred---of 6
install_session_keyring_to_cred---of 11
install_thread_keyring_to_cred---of 6
join_session_keyring---of 17
key_change_session_keyring---of 24
key_fsgid_changed---of 2
key_fsuid_changed50%of 2
look_up_user_keyrings---of 33
lookup_user_key---of 96
lookup_user_key_possessed---of 1
search_cred_keyrings_rcu---of 16
search_process_keyrings_rcu---of 11
__do_sys_capget---of 34
__do_sys_capset---of 20
__ia32_sys_capget---of 1
__ia32_sys_capset---of 1
__x64_sys_capget---of 1
__x64_sys_capset---of 1
cap_validate_magic---of 11
capable60%of 5
capable.cold---of 1
capable_wrt_inode_uidgid---of 6
capable_wrt_inode_uidgid.cold---of 1
file_ns_capable---of 5
has_capability---of 1
has_capability_noaudit---of 1
has_ns_capability---of 18
has_ns_capability_noaudit---of 18
ns_capable---of 5
ns_capable.cold---of 1
ns_capable_noaudit---of 5
ns_capable_noaudit.cold---of 1
ns_capable_setid100%of 5
ns_capable_setid.cold---of 1
privileged_wrt_inode_uidgid---of 4
ptracer_capable---of 20
__put_user_ns---of 1
cmp_extents_forward---of 3
cmp_extents_reverse---of 3
cmp_map_id---of 8
create_user_ns---of 15
current_in_userns---of 3
free_user_ns---of 8
from_kgid---of 1
from_kgid_munged---of 3
from_kprojid---of 1
from_kprojid_munged---of 2
from_kuid---of 1
from_kuid_munged67%of 3
gid_m_show---of 3
gid_m_start---of 5
in_userns---of 3
m_next---of 1
make_kgid---of 1
make_kprojid---of 1
make_kuid100%of 1
map_id_range_down92%of 12
map_id_up80%of 10
map_write---of 98
map_write.cold---of 1
ns_get_owner---of 7
proc_gid_map_write---of 5
proc_projid_map_write---of 5
proc_setgroups_show---of 2
proc_setgroups_write---of 13
proc_uid_map_write---of 5
projid_m_show---of 3
projid_m_start---of 5
uid_m_show---of 3
uid_m_start---of 7
unshare_userns---of 8
userns_get---of 22
userns_install---of 13
userns_may_setgroups---of 3
userns_owner---of 1
userns_put---of 3
__find_next_entry---of 17
__ftrace_trace_stack---of 15
__remove_instance.part.0---of 13
__set_tracer_option.isra.0---of 5
__trace_array_vprintk.part.0---of 16
__trace_bputs---of 11
__trace_find_cmdline---of 11
__trace_find_cmdline.cold---of 1
__trace_puts---of 14
__trace_stack---of 4
alloc_percpu_trace_buffer.part.0---of 5
alloc_percpu_trace_buffer.part.0.cold---of 1
allocate_cmdlines_buffer---of 5
allocate_trace_buffer---of 7
buffer_percent_read---of 1
buffer_percent_write---of 5
buffer_pipe_buf_get---of 8
buffer_pipe_buf_release---of 1
buffer_ref_release---of 5
buffer_spd_release---of 1
call_filter_check_discard---of 6
clear_tracing_err_log---of 3
close_pipe_on_cpu---of 7
create_trace_option_files---of 25
create_trace_option_files.cold---of 2
disable_trace_buffered_event---of 1
disable_trace_on_warning---of 4
dummy_set_flag---of 1
enable_trace_buffered_event---of 1
err_pos---of 6
ftrace_dump---of 18
ftrace_dump.cold---of 12
ftrace_exports---of 5
ftrace_now---of 4
get_order---of 1
get_total_entries---of 6
init_tracer_tracefs---of 49
init_tracer_tracefs.cold---of 25
instance_mkdir---of 8
instance_rmdir---of 10
is_tracing_stopped---of 1
ns2usecs---of 1
nsecs_to_usecs---of 1
peek_next_entry---of 10
print_event_info---of 1
print_trace_header---of 3
print_trace_line---of 63
rb_simple_read---of 4
rb_simple_write---of 19
register_ftrace_export---of 11
s_next---of 10
s_show---of 15
s_start---of 22
s_stop---of 5
saved_cmdlines_next---of 7
saved_cmdlines_show---of 1
saved_cmdlines_start---of 13
saved_cmdlines_stop---of 1
saved_tgids_next---of 5
saved_tgids_show---of 3
saved_tgids_start---of 5
set_tracer_flag---of 35
show_traces_open---of 10
show_traces_release---of 5
t_next---of 8
t_show---of 6
t_start---of 15
t_stop---of 1
trace_array_create---of 13
trace_array_create_dir---of 9
trace_array_destroy---of 11
trace_array_find---of 5
trace_array_find_get---of 7
trace_array_get---of 6
trace_array_get_by_name---of 8
trace_array_init_printk---of 6
trace_array_printk---of 8
trace_array_printk_buf---of 6
trace_array_put---of 4
trace_array_vprintk---of 4
trace_automount---of 5
trace_buffer_lock_reserve---of 3
trace_buffer_unlock_commit_nostack---of 3
trace_buffer_unlock_commit_regs---of 20
trace_buffered_event_disable---of 9
trace_buffered_event_enable---of 14
trace_buffered_event_enable.cold---of 1
trace_clock_in_ns---of 1
trace_create_file---of 2
trace_create_file.cold---of 1
trace_default_header---of 13
trace_die_handler---of 5
trace_dump_stack---of 3
trace_empty---of 11
trace_event_buffer_commit---of 29
trace_event_buffer_lock_reserve---of 12
trace_event_format---of 18
trace_filter_add_remove_task---of 8
trace_find_cmdline---of 4
trace_find_filtered_pid---of 4
trace_find_next_entry---of 13
trace_find_next_entry_inc---of 6
trace_find_tgid---of 6
trace_free_pid_list---of 1
trace_function---of 12
trace_get_user---of 32
trace_handle_return---of 3
trace_ignore_this_task---of 7
trace_init_global_iter---of 8
trace_keep_overwrite---of 5
trace_latency_header---of 5
trace_module_notify---of 6
trace_options_core_read---of 2
trace_options_core_write---of 6
trace_options_init_dentry.part.0---of 7
trace_options_init_dentry.part.0.cold---of 1
trace_options_read---of 2
trace_options_write---of 8
trace_panic_handler---of 4
trace_parse_run_command---of 18
trace_parse_run_command.cold---of 1
trace_parser_get_init---of 3
trace_parser_put---of 1
trace_pid_next---of 3
trace_pid_show---of 1
trace_pid_start---of 7
trace_pid_write---of 23
trace_printk_init_buffers---of 3
trace_printk_init_buffers.cold---of 2
trace_printk_seq---of 4
trace_printk_seq.cold---of 1
trace_printk_start_comm---of 2
trace_run_command---of 5
trace_save_cmdline---of 14
trace_set_options---of 11
trace_total_entries---of 2
trace_total_entries_cpu---of 5
trace_vbprintk---of 21
trace_vprintk---of 5
tracepoint_printk_sysctl---of 7
tracer_init---of 1
tracer_tracing_is_on---of 5
tracer_tracing_off---of 3
tracer_tracing_on---of 3
tracing_alloc_snapshot---of 4
tracing_buffers_open---of 14
tracing_buffers_poll---of 5
tracing_buffers_read---of 26
tracing_buffers_release---of 5
tracing_buffers_splice_read---of 38
tracing_check_open_get_tr---of 11
tracing_clock_open---of 7
tracing_clock_show---of 7
tracing_clock_write---of 6
tracing_cond_snapshot_data---of 1
tracing_cpumask_read---of 5
tracing_cpumask_write---of 5
tracing_entries_read---of 13
tracing_entries_write---of 8
tracing_err_log_open---of 11
tracing_err_log_release---of 8
tracing_err_log_seq_next---of 1
tracing_err_log_seq_show---of 7
tracing_err_log_seq_start---of 1
tracing_err_log_seq_stop---of 1
tracing_err_log_write---of 1
tracing_free_buffer_release---of 9
tracing_free_buffer_write---of 1
tracing_generic_entry_update78%of 9
tracing_init_dentry---of 7
tracing_init_dentry.cold---of 1
tracing_is_disabled---of 1
tracing_is_enabled---of 1
tracing_is_on---of 5
tracing_iter_reset---of 10
tracing_log_err---of 7
tracing_lseek---of 5
tracing_mark_raw_write---of 15
tracing_mark_write---of 26
tracing_off---of 3
tracing_on---of 3
tracing_open---of 43
tracing_open_file_tr---of 9
tracing_open_generic---of 4
tracing_open_generic_tr---of 3
tracing_open_options---of 3
tracing_open_pipe---of 20
tracing_poll_pipe---of 5
tracing_read_pipe---of 26
tracing_readme_read---of 1
tracing_record_cmdline---of 3
tracing_record_taskinfo---of 13
tracing_record_taskinfo_sched_switch---of 20
tracing_record_tgid---of 7
tracing_release---of 20
tracing_release_file_tr---of 5
tracing_release_generic_tr---of 5
tracing_release_options---of 5
tracing_release_pipe---of 7
tracing_reset_all_online_cpus---of 1
tracing_reset_all_online_cpus_unlocked---of 7
tracing_reset_online_cpus---of 6
tracing_resize_ring_buffer---of 13
tracing_saved_cmdlines_open---of 4
tracing_saved_cmdlines_size_read---of 4
tracing_saved_cmdlines_size_write---of 9
tracing_saved_tgids_open---of 4
tracing_set_clock---of 5
tracing_set_cpumask---of 16
tracing_set_time_stamp_abs---of 7
tracing_set_trace_read---of 1
tracing_set_trace_write---of 8
tracing_set_tracer---of 25
tracing_set_tracer.cold---of 1
tracing_single_release_tr---of 5
tracing_snapshot---of 2
tracing_snapshot_alloc---of 2
tracing_snapshot_cond---of 2
tracing_snapshot_cond_disable---of 1
tracing_snapshot_cond_enable---of 1
tracing_spd_release_pipe---of 1
tracing_splice_read_pipe---of 39
tracing_start---of 2
tracing_start_tr.part.0---of 10
tracing_stats_read---of 8
tracing_stop---of 1
tracing_stop_tr---of 8
tracing_thresh_read---of 5
tracing_thresh_write---of 8
tracing_time_stamp_mode_open---of 7
tracing_time_stamp_mode_show---of 4
tracing_total_entries_read---of 8
tracing_trace_options_open---of 7
tracing_trace_options_show---of 11
tracing_trace_options_write---of 6
tracing_update_buffers---of 9
tracing_wait_pipe---of 12
tracing_write_stub---of 1
unregister_ftrace_export---of 13
__alloc_fd---of 23
__alloc_fd.cold---of 1
__close_fd---of 3
__close_fd_get_file---of 11
__close_range---of 33
__f_unlock_pos---of 1
__fd_install---of 34
__fdget100%of 1
__fdget_pos---of 7
__fdget_raw---of 1
__fget_files63%of 24
__fget_light40%of 10
__get_unused_fd_flags---of 1
__ia32_sys_dup---of 5
__ia32_sys_dup2---of 26
__ia32_sys_dup3---of 1
__put_unused_fd---of 8
__receive_fd---of 13
__x64_sys_dup---of 5
__x64_sys_dup2---of 26
__x64_sys_dup3---of 1
alloc_fdtable---of 9
close_fd_get_file---of 1
copy_fd_bitmaps---of 1
do_close_on_exec---of 16
do_dup2---of 19
dup_fd---of 39
exit_files---of 2
expand_files---of 35
f_dupfd---of 4
fd_install---of 1
fget---of 1
fget_many---of 1
fget_raw---of 1
fget_task---of 3
free_fdtable_rcu---of 1
get_close_on_exec---of 19
get_files_struct---of 3
get_unused_fd_flags---of 1
iterate_fd---of 19
ksys_dup3---of 16
pick_file---of 10
put_files_struct---of 11
put_unused_fd---of 1
replace_fd---of 8
reset_files_struct---of 1
set_close_on_exec---of 11
__traceiter_irq_disable---of 4
__traceiter_irq_enable---of 4
perf_trace_preemptirq_template---of 6
trace_event_raw_event_preemptirq_template---of 10
trace_hardirqs_off40%of 10
trace_hardirqs_off_caller---of 10
trace_hardirqs_off_finish55%of 11
trace_hardirqs_on50%of 12
trace_hardirqs_on_caller---of 12
trace_hardirqs_on_prepare59%of 12
trace_raw_output_preemptirq_template---of 4
__do_compat_sys_getrusage---of 4
__do_compat_sys_sysinfo---of 7
__do_sys_geteuid---of 1
__do_sys_getgid---of 1
__do_sys_getpgrp---of 1
__do_sys_getppid---of 18
__do_sys_getpriority---of 63
__do_sys_getrusage---of 4
__do_sys_gettid---of 1
__do_sys_newuname---of 6
__do_sys_olduname---of 7
__do_sys_prctl---of 66
__do_sys_prlimit64---of 65
__do_sys_setpgid---of 34
__do_sys_setpriority---of 65
__do_sys_setsid---of 1
__do_sys_sysinfo---of 2
__do_sys_uname---of 7
__ia32_compat_sys_getrlimit---of 6
__ia32_compat_sys_getrusage---of 1
__ia32_compat_sys_old_getrlimit---of 4
__ia32_compat_sys_setrlimit---of 7
__ia32_compat_sys_sysinfo---of 1
__ia32_compat_sys_times---of 6
__ia32_sys_getcpu---of 6
__ia32_sys_gethostname---of 4
__ia32_sys_gethostname.cold---of 1
__ia32_sys_getpgid---of 1
__ia32_sys_getpriority---of 1
__ia32_sys_getresgid---of 4
__ia32_sys_getresuid---of 5
__ia32_sys_getrlimit---of 4
__ia32_sys_getrusage---of 1
__ia32_sys_getsid---of 20
__ia32_sys_newuname---of 1
__ia32_sys_old_getrlimit---of 7
__ia32_sys_olduname---of 1
__ia32_sys_prctl---of 1
__ia32_sys_prlimit64---of 1
__ia32_sys_setdomainname---of 5
__ia32_sys_setfsgid---of 1
__ia32_sys_setfsuid---of 1
__ia32_sys_setgid---of 1
__ia32_sys_sethostname---of 5
__ia32_sys_setpgid---of 1
__ia32_sys_setpriority---of 1
__ia32_sys_setregid---of 1
__ia32_sys_setresgid---of 1
__ia32_sys_setresuid---of 1
__ia32_sys_setreuid---of 1
__ia32_sys_setrlimit---of 3
__ia32_sys_setuid---of 1
__ia32_sys_sysinfo---of 1
__ia32_sys_times---of 6
__ia32_sys_umask---of 1
__ia32_sys_uname---of 1
__sys_setfsgid---of 14
__sys_setfsuid93%of 14
__sys_setgid---of 13
__sys_setregid---of 26
__sys_setresgid---of 39
__sys_setresuid85%of 40
__sys_setreuid---of 31
__sys_setuid---of 13
__x64_sys_getcpu---of 6
__x64_sys_getegid---of 1
__x64_sys_gethostname---of 4
__x64_sys_gethostname.cold---of 1
__x64_sys_getpgid---of 1
__x64_sys_getpid---of 1
__x64_sys_getpriority---of 1
__x64_sys_getresgid---of 4
__x64_sys_getresuid---of 5
__x64_sys_getrlimit---of 4
__x64_sys_getrusage---of 1
__x64_sys_getsid---of 20
__x64_sys_getuid---of 1
__x64_sys_newuname---of 1
__x64_sys_old_getrlimit---of 7
__x64_sys_olduname---of 1
__x64_sys_prctl---of 1
__x64_sys_prlimit64---of 1
__x64_sys_setdomainname---of 5
__x64_sys_setfsgid---of 1
__x64_sys_setfsuid100%of 1
__x64_sys_setgid---of 1
__x64_sys_sethostname---of 5
__x64_sys_setpgid---of 1
__x64_sys_setpriority---of 1
__x64_sys_setregid---of 1
__x64_sys_setresgid---of 1
__x64_sys_setresuid100%of 1
__x64_sys_setreuid---of 1
__x64_sys_setrlimit---of 3
__x64_sys_setuid---of 1
__x64_sys_sysinfo---of 1
__x64_sys_times---of 6
__x64_sys_umask---of 1
__x64_sys_uname---of 1
do_getpgid---of 19
do_prlimit---of 21
do_sys_times---of 1
do_sysinfo.isra.0---of 7
getrusage---of 39
ksys_setsid---of 10
override_release.part.0---of 9
prctl_set_auxv---of 4
prctl_set_mm---of 68
propagate_has_child_subreaper---of 5
set_one_prio---of 14
set_user72%of 7
__disable_kprobe---of 16
__disarm_kprobe---of 5
__free_insn_slot---of 27
__get_insn_slot---of 34
__get_valid_kprobe---of 13
__is_insn_slot_addr59%of 17
__kretprobe_trampoline_handler---of 20
__kretprobe_trampoline_handler.cold---of 2
__unregister_kprobe_bottom---of 5
__unregister_kprobe_top---of 26
aggr_fault_handler---of 5
aggr_post_handler---of 5
aggr_pre_handler---of 6
alloc_aggr_kprobe---of 5
arch_check_ftrace_location---of 1
arch_deref_entry_point---of 1
arch_kprobe_get_kallsym---of 1
arch_kprobe_on_func_entry---of 1
arch_within_kprobe_blacklist---of 3
arm_kprobe---of 6
cleanup_rp_inst---of 11
collect_garbage_slots---of 9
collect_one_slot.part.0---of 4
disable_kprobe---of 3
dump_kprobe---of 1
enable_kprobe---of 10
get_kprobe---of 4
get_optimized_kprobe---of 10
init_aggr_kprobe---of 6
is_cfi_preamble_symbol---of 10
is_cfi_preamble_symbol.cold---of 1
kill_kprobe---of 20
kprobe_add_area_blacklist---of 7
kprobe_add_ksym_blacklist---of 5
kprobe_blacklist_open---of 4
kprobe_blacklist_seq_next---of 1
kprobe_blacklist_seq_show---of 5
kprobe_blacklist_seq_start---of 1
kprobe_blacklist_seq_stop---of 1
kprobe_busy_begin---of 1
kprobe_busy_end---of 1
kprobe_cache_get_kallsym---of 18
kprobe_disarmed---of 6
kprobe_exceptions_notify---of 1
kprobe_flush_task---of 7
kprobe_free_init_mem---of 8
kprobe_get_kallsym---of 4
kprobe_lookup_name---of 1
kprobe_on_func_entry---of 10
kprobe_optimizer---of 36
kprobe_remove_area_blacklist---of 5
kprobe_seq_next---of 2
kprobe_seq_start---of 2
kprobes_inc_nmissed_count---of 4
kprobes_module_callback---of 41
kprobes_open---of 4
kretprobe_hash_lock---of 1
kretprobe_hash_unlock---of 1
kretprobe_table_lock---of 1
kretprobe_table_unlock---of 1
opt_pre_handler---of 5
optimize_all_kprobes---of 9
optimize_kprobe---of 16
optprobe_queued_unopt---of 6
pre_handler_kretprobe---of 13
proc_kprobes_optimization_handler---of 12
read_enabled_file_bool---of 2
recycle_rp_inst---of 8
register_kprobe---of 76
register_kprobes---of 6
register_kretprobe---of 34
register_kretprobes---of 6
report_probe---of 13
show_kprobe_addr---of 7
unoptimize_kprobe---of 15
unregister_kprobe---of 4
unregister_kprobes---of 2
unregister_kprobes.part.0---of 9
unregister_kretprobe---of 1
unregister_kretprobes---of 2
unregister_kretprobes.part.0---of 9
wait_for_kprobe_optimizer---of 5
within_kprobe_blacklist---of 8
within_kprobe_blacklist.part.0---of 9
write_enabled_file_bool---of 25
write_enabled_file_bool.cold---of 10
refcount_dec_and_lock---of 10
refcount_dec_and_lock_irqsave23%of 9
refcount_dec_and_mutex_lock---of 10
refcount_dec_if_one---of 1
refcount_dec_not_one67%of 9
refcount_warn_saturate---of 13
cap_bprm_creds_from_file---of 93
cap_bprm_creds_from_file.cold---of 2
cap_capable34%of 9
cap_capget---of 18
cap_capset---of 24
cap_convert_nscap---of 14
cap_inode_getsecurity---of 34
cap_inode_killpriv---of 2
cap_inode_need_killpriv---of 1
cap_inode_removexattr---of 7
cap_inode_setxattr---of 5
cap_mmap_addr---of 11
cap_mmap_file---of 1
cap_ptrace_access_check---of 27
cap_ptrace_traceme---of 24
cap_safe_nice---of 28
cap_settime---of 1
cap_task_fix_setuid47%of 30
cap_task_prctl---of 45
cap_task_setioprio---of 1
cap_task_setnice---of 1
cap_task_setscheduler---of 1
cap_vm_enough_memory---of 9
get_vfs_caps_from_disk---of 20
page_counter_cancel50%of 2
page_counter_charge---of 5
page_counter_memparse---of 5
page_counter_set_low---of 2
page_counter_set_max---of 5
page_counter_set_min---of 2
page_counter_try_charge67%of 9
page_counter_uncharge100%of 2
propagate_protected_usage78%of 9
__do_SAK---of 2
__do_SAK.part.0---of 16
__do_SAK.part.0.cold---of 10
__start_tty---of 4
__stop_tty---of 3
__tty_alloc_driver---of 15
__tty_hangup.part.0---of 30
alloc_tty_struct---of 8
check_tty_count.isra.0---of 7
check_tty_count.isra.0.cold---of 4
compat_tty_tiocgserial---of 6
compat_tty_tiocsserial---of 7
console_sysfs_notify---of 2
destruct_tty_driver---of 10
do_SAK---of 2
do_SAK_work---of 2
do_tty_hangup---of 2
file_tty_write.constprop.0---of 32
file_tty_write.constprop.0.cold---of 3
get_order---of 1
hung_up_tty_compat_ioctl---of 2
hung_up_tty_fasync---of 1
hung_up_tty_ioctl---of 2
hung_up_tty_poll---of 1
hung_up_tty_read---of 1
hung_up_tty_write---of 1
put_tty_driver---of 5
redirected_tty_write---of 5
release_one_tty---of 8
release_tty---of 30
send_break.part.0---of 8
show_cons_active---of 21
start_tty---of 5
stop_tty---of 4
this_tty---of 4
tty_add_file---of 1
tty_alloc_file---of 3
tty_cdev_add---of 4
tty_compat_ioctl---of 19
tty_default_fops---of 1
tty_dev_name_to_number---of 13
tty_device_create_release---of 1
tty_devnode---of 5
tty_devnum---of 1
tty_do_resize---of 5
tty_driver_kref_put---of 5
tty_driver_name---of 5
tty_fasync---of 15
tty_fasync.cold---of 2
tty_free_file---of 1
tty_hangup---of 1
tty_hung_up_p---of 4
tty_init_dev---of 3
tty_init_dev.part.0---of 22
tty_init_dev.part.0.cold---of 3
tty_init_termios---of 6
tty_ioctl10%of 96
tty_kclose---of 3
tty_kopen---of 25
tty_kref_put---of 6
tty_lookup_driver---of 31
tty_name---of 4
tty_open---of 74
tty_paranoia_check100%of 3
tty_paranoia_check.cold---of 2
tty_poll---of 7
tty_poll.cold---of 2
tty_put_char---of 4
tty_read---of 24
tty_register_device---of 1
tty_register_device_attr---of 17
tty_register_device_attr.cold---of 1
tty_register_driver---of 17
tty_release---of 55
tty_release.cold---of 12
tty_release_struct---of 3
tty_reopen---of 12
tty_save_termios---of 5
tty_send_xchar---of 9
tty_set_operations---of 1
tty_show_fdinfo---of 4
tty_standard_install---of 6
tty_unregister_device---of 2
tty_unregister_driver---of 1
tty_vhangup---of 2
tty_vhangup_self---of 6
tty_vhangup_session---of 2
tty_wakeup---of 6
tty_write---of 1
tty_write_lock---of 4
tty_write_message---of 5
tty_write_unlock---of 1
__traceiter_x86_fpu_after_restore---of 4
__traceiter_x86_fpu_after_save---of 4
__traceiter_x86_fpu_before_restore---of 4
__traceiter_x86_fpu_before_save---of 4
__traceiter_x86_fpu_copy_dst---of 4
__traceiter_x86_fpu_copy_src---of 4
__traceiter_x86_fpu_dropped---of 4
__traceiter_x86_fpu_init_state---of 4
__traceiter_x86_fpu_regs_activated---of 4
__traceiter_x86_fpu_regs_deactivated---of 4
__traceiter_x86_fpu_xstate_check_failed---of 4
copy_fpregs_to_fpstate---of 10
copy_init_fpstate_to_fpregs---of 6
copy_kernel_to_fpregs---of 8
fpregs_assert_state_consistent75%of 4
fpregs_mark_activate---of 10
fpstate_init---of 4
fpu__clear---of 12
fpu__clear_all---of 1
fpu__clear_user_states---of 1
fpu__copy---of 26
fpu__drop---of 20
fpu__exception_code---of 9
fpu__prepare_read---of 2
fpu__prepare_write---of 3
fpu__save---of 25
irq_fpu_usable---of 7
kernel_fpu_begin_mask---of 12
kernel_fpu_end---of 3
perf_trace_x86_fpu---of 8
switch_fpu_return---of 15
trace_event_raw_event_x86_fpu---of 12
trace_raw_output_x86_fpu---of 4
__vt_event_wait.part.0---of 6
change_console---of 9
complete_change_console---of 13
pm_set_vt_switch---of 1
reset_vc---of 2
vc_SAK---of 7
vt_compat_ioctl---of 19
vt_disallocate_all---of 16
vt_event_post---of 7
vt_ioctl4%of 209
vt_move_to_console---of 10
vt_waitactive---of 7
call_blocking_lsm_notifier---of 1
get_order---of 1
inode_free_by_rcu---of 1
lsm_append.constprop.0---of 10
lsm_inode_alloc---of 5
register_blocking_lsm_notifier---of 1
security_add_mnt_opt---of 4
security_audit_rule_free---of 2
security_audit_rule_init---of 4
security_audit_rule_known---of 4
security_audit_rule_match---of 4
security_binder_set_context_mgr---of 4
security_binder_transaction---of 4
security_binder_transfer_binder---of 4
security_binder_transfer_file---of 4
security_bprm_check---of 5
security_bprm_committed_creds---of 2
security_bprm_committing_creds---of 2
security_bprm_creds_for_exec---of 4
security_bprm_creds_from_file---of 4
security_capable100%of 4
security_capget---of 4
security_capset---of 4
security_cred_alloc_blank---of 9
security_cred_free---of 5
security_cred_getsecid---of 2
security_d_instantiate---of 4
security_dentry_create_files_as---of 4
security_dentry_init_security---of 4
security_file_alloc---of 9
security_file_fcntl---of 4
security_file_free---of 4
security_file_ioctl100%of 4
security_file_ioctl_compat---of 4
security_file_lock---of 4
security_file_mprotect---of 5
security_file_open---of 20
security_file_permission---of 23
security_file_receive---of 4
security_file_send_sigiotask---of 4
security_file_set_fowner---of 2
security_free_mnt_opts---of 5
security_fs_context_dup---of 4
security_fs_context_parse_param---of 5
security_getprocattr---of 6
security_inet_conn_established---of 2
security_inet_conn_request---of 4
security_inet_csk_clone---of 2
security_inode_alloc---of 9
security_inode_copy_up---of 4
security_inode_copy_up_xattr---of 5
security_inode_create---of 6
security_inode_follow_link---of 6
security_inode_free---of 4
security_inode_getattr---of 5
security_inode_getsecctx---of 4
security_inode_getsecid---of 2
security_inode_getsecurity---of 6
security_inode_getxattr---of 5
security_inode_init_security---of 15
security_inode_invalidate_secctx---of 2
security_inode_killpriv---of 4
security_inode_link---of 5
security_inode_listsecurity---of 5
security_inode_listxattr---of 5
security_inode_mkdir---of 6
security_inode_mknod---of 6
security_inode_need_killpriv---of 4
security_inode_notifysecctx---of 4
security_inode_permission---of 5
security_inode_post_setxattr---of 3
security_inode_readlink---of 5
security_inode_removexattr---of 8
security_inode_rename---of 12
security_inode_rmdir---of 5
security_inode_setattr---of 6
security_inode_setsecctx---of 4
security_inode_setsecurity---of 5
security_inode_setxattr---of 8
security_inode_symlink---of 5
security_inode_unlink---of 5
security_ipc_getsecid---of 2
security_ipc_permission---of 5
security_ismaclabel---of 4
security_kernel_act_as---of 4
security_kernel_create_files_as---of 4
security_kernel_load_data---of 5
security_kernel_module_request---of 5
security_kernel_post_load_data---of 5
security_kernel_post_read_file---of 5
security_kernel_read_file---of 5
security_kernfs_init_security---of 4
security_key_alloc---of 4
security_key_free---of 2
security_key_getsecurity---of 4
security_key_permission---of 4
security_locked_down---of 4
security_mmap_addr---of 4
security_mmap_file---of 10
security_move_mount---of 4
security_msg_msg_alloc---of 11
security_msg_msg_free---of 3
security_msg_queue_alloc---of 11
security_msg_queue_associate---of 4
security_msg_queue_free---of 3
security_msg_queue_msgctl---of 4
security_msg_queue_msgrcv---of 4
security_msg_queue_msgsnd---of 4
security_netlink_send---of 4
security_old_inode_init_security---of 5
security_path_notify---of 4
security_perf_event_alloc---of 4
security_perf_event_free---of 2
security_perf_event_open---of 4
security_perf_event_read---of 4
security_perf_event_write---of 4
security_prepare_creds78%of 9
security_ptrace_access_check---of 4
security_ptrace_traceme---of 4
security_quota_on---of 4
security_quotactl---of 4
security_release_secctx---of 2
security_req_classify_flow---of 2
security_sb_alloc---of 4
security_sb_clone_mnt_opts---of 4
security_sb_eat_lsm_opts---of 4
security_sb_free---of 2
security_sb_kern_mount---of 4
security_sb_mount---of 4
security_sb_pivotroot---of 4
security_sb_remount---of 4
security_sb_set_mnt_opts---of 5
security_sb_show_options---of 4
security_sb_statfs---of 4
security_sb_umount---of 4
security_sctp_assoc_request---of 4
security_sctp_bind_connect---of 4
security_sctp_sk_clone---of 2
security_secctx_to_secid---of 4
security_secid_to_secctx---of 4
security_secmark_refcount_dec---of 2
security_secmark_refcount_inc---of 2
security_secmark_relabel_packet---of 4
security_sem_alloc---of 11
security_sem_associate---of 4
security_sem_free---of 3
security_sem_semctl---of 4
security_sem_semop---of 4
security_setprocattr---of 6
security_settime64---of 4
security_shm_alloc---of 11
security_shm_associate---of 4
security_shm_free---of 3
security_shm_shmat---of 4
security_shm_shmctl---of 4
security_sk_alloc---of 4
security_sk_classify_flow---of 3
security_sk_clone---of 2
security_sk_free---of 2
security_sock_graft---of 2
security_sock_rcv_skb---of 4
security_socket_accept---of 4
security_socket_bind---of 4
security_socket_connect---of 4
security_socket_create---of 4
security_socket_getpeername---of 4
security_socket_getpeersec_dgram---of 4
security_socket_getpeersec_stream---of 4
security_socket_getsockname---of 4
security_socket_getsockopt---of 4
security_socket_listen---of 4
security_socket_post_create---of 4
security_socket_recvmsg---of 4
security_socket_sendmsg---of 4
security_socket_setsockopt---of 4
security_socket_shutdown---of 4
security_socket_socketpair---of 4
security_syslog---of 4
security_task_alloc---of 11
security_task_fix_setgid---of 4
security_task_fix_setuid100%of 4
security_task_free---of 3
security_task_getioprio---of 4
security_task_getpgid---of 4
security_task_getscheduler---of 4
security_task_getsecid---of 2
security_task_getsid---of 4
security_task_kill---of 4
security_task_movememory---of 4
security_task_prctl---of 5
security_task_prlimit---of 4
security_task_setioprio---of 4
security_task_setnice---of 4
security_task_setpgid---of 4
security_task_setrlimit---of 4
security_task_setscheduler---of 4
security_task_to_inode---of 2
security_transfer_creds---of 2
security_tun_dev_alloc_security---of 4
security_tun_dev_attach---of 4
security_tun_dev_attach_queue---of 4
security_tun_dev_create---of 4
security_tun_dev_free_security---of 2
security_tun_dev_open---of 4
security_unix_may_send---of 4
security_unix_stream_connect---of 4
security_vm_enough_memory_mm---of 4
unregister_blocking_lsm_notifier---of 1
__do_sys_uselib---of 24
__get_task_comm---of 1
__ia32_compat_sys_execve---of 1
__ia32_compat_sys_execveat---of 1
__ia32_sys_execve---of 1
__ia32_sys_execveat---of 1
__ia32_sys_uselib---of 1
__register_binfmt---of 8
__set_task_comm---of 10
__x64_sys_execve---of 1
__x64_sys_execveat---of 1
__x64_sys_uselib---of 1
acct_arg_size---of 3
alloc_bprm---of 23
begin_new_exec---of 98
bprm_change_interp---of 4
bprm_execve---of 109
copy_string_kernel---of 16
copy_string_kernel.cold---of 1
copy_strings.isra.0---of 30
copy_strings_kernel---of 6
count.constprop.0---of 9
count_strings_kernel.part.0---of 6
do_execveat_common---of 21
do_execveat_common.cold---of 1
do_open_execat---of 23
finalize_exec---of 1
free_bprm---of 11
get_arg_page---of 6
get_user_arg_ptr.isra.0---of 6
kernel_execve---of 19
open_exec---of 4
path_noexec---of 3
remove_arg_zero---of 16
set_binfmt---of 4
set_dumpable75%of 4
setup_arg_pages---of 21
setup_arg_pages.cold---of 1
setup_new_exec---of 5
shift_arg_pages---of 14
unregister_binfmt---of 1
would_dump---of 10
cn_proc_mcast_ctl---of 23
proc_comm_connector---of 16
proc_coredump_connector---of 35
proc_exec_connector---of 16
proc_exit_connector---of 35
proc_fork_connector---of 33
proc_id_connector7%of 44
proc_ptrace_connector---of 20
proc_sid_connector---of 16
__put_cred56%of 9
abort_creds80%of 5
commit_creds77%of 47
copy_creds---of 20
cred_alloc_blank---of 4
cred_fscmp---of 15
exit_creds---of 6
get_task_cred---of 24
override_creds---of 1
prepare_creds43%of 35
prepare_exec_creds---of 3
prepare_kernel_cred---of 23
put_cred_rcu---of 9
revert_creds---of 3
set_create_files_as---of 4
set_security_override---of 1
set_security_override_from_ctx---of 3
__traceiter_selinux_audited---of 4
avc_alloc_node---of 34
avc_audit_post_callback---of 30
avc_audit_pre_callback---of 11
avc_compute_av.isra.0---of 28
avc_compute_av.isra.0.cold---of 1
avc_copy_xperms_decision---of 6
avc_denied---of 6
avc_disable---of 2
avc_flush---of 18
avc_get_cache_threshold---of 1
avc_get_hash_stats---of 19
avc_has_extended_perms24%of 64
avc_has_perm---of 7
avc_has_perm_flags---of 8
avc_has_perm_noaudit58%of 21
avc_lookup89%of 9
avc_node_delete---of 3
avc_node_free---of 1
avc_node_kill---of 1
avc_node_replace---of 3
avc_policy_seqno---of 1
avc_set_cache_threshold---of 1
avc_ss_reset---of 7
avc_update_node.part.0.isra.0---of 26
avc_xperms_decision_alloc---of 9
avc_xperms_decision_free---of 7
avc_xperms_free---of 4
avc_xperms_populate.part.0---of 7
perf_trace_selinux_audited---of 9
selinux_avc_init---of 3
slow_avc_audit---of 6
trace_event_get_offsets_selinux_audited.constprop.0---of 4
trace_event_raw_event_selinux_audited---of 13
trace_raw_output_selinux_audited---of 4
__inode_security_revalidate---of 9
audit_inode_permission---of 3
backing_inode_security---of 8
check_nnp_nosuid.isra.0---of 10
cred_has_capability.isra.073%of 11
cred_has_capability.isra.0.cold---of 1
delayed_superblock_init---of 1
file_has_perm---of 5
file_map_prot_check---of 24
get_order---of 1
has_cap_mac_admin---of 5
inode_doinit_use_xattr---of 13
inode_doinit_use_xattr.cold---of 3
inode_doinit_with_dentry---of 58
inode_has_perm---of 5
inode_security88%of 8
ioctl_has_perm.constprop.0.isra.080%of 5
ipc_has_perm---of 1
match_file---of 8
may_context_mount_inode_relabel.isra.0---of 3
may_context_mount_sb_relabel.isra.0---of 4
may_create---of 5
may_link---of 6
ptrace_parent_sid---of 21
sb_finish_set_opts---of 23
sb_finish_set_opts.cold---of 4
selinux_add_mnt_opt---of 11
selinux_add_opt---of 19
selinux_add_opt.cold---of 1
selinux_binder_set_context_mgr---of 1
selinux_binder_transaction---of 4
selinux_binder_transfer_binder---of 1
selinux_binder_transfer_file---of 10
selinux_bprm_committed_creds---of 7
selinux_bprm_committing_creds---of 16
selinux_bprm_creds_for_exec---of 17
selinux_capable100%of 1
selinux_capget---of 1
selinux_capset---of 1
selinux_complete_init---of 1
selinux_cred_getsecid---of 1
selinux_cred_prepare100%of 1
selinux_cred_transfer---of 1
selinux_d_instantiate---of 2
selinux_dentry_create_files_as---of 4
selinux_dentry_init_security---of 4
selinux_determine_inode_label---of 8
selinux_disable---of 3
selinux_disable.cold---of 5
selinux_file_alloc_security---of 1
selinux_file_fcntl---of 7
selinux_file_ioctl34%of 9
selinux_file_ioctl_compat---of 2
selinux_file_lock---of 4
selinux_file_mprotect---of 24
selinux_file_open---of 9
selinux_file_permission---of 22
selinux_file_receive---of 6
selinux_file_send_sigiotask---of 4
selinux_file_set_fowner---of 1
selinux_free_mnt_opts---of 1
selinux_fs_context_dup---of 11
selinux_fs_context_parse_param---of 4
selinux_getprocattr---of 42
selinux_inet_conn_established---of 4
selinux_inet_conn_request---of 7
selinux_inet_csk_clone---of 1
selinux_inet_sys_rcv_skb---of 5
selinux_inode_alloc_security---of 3
selinux_inode_copy_up---of 6
selinux_inode_copy_up_xattr---of 2
selinux_inode_create---of 1
selinux_inode_follow_link---of 9
selinux_inode_free_security---of 5
selinux_inode_getattr---of 1
selinux_inode_getsecctx---of 3
selinux_inode_getsecid---of 3
selinux_inode_getsecurity---of 10
selinux_inode_getxattr---of 1
selinux_inode_init_security---of 16
selinux_inode_invalidate_secctx---of 3
selinux_inode_link---of 1
selinux_inode_listsecurity---of 5
selinux_inode_listxattr---of 1
selinux_inode_mkdir---of 1
selinux_inode_mknod---of 2
selinux_inode_notifysecctx---of 2
selinux_inode_permission---of 34
selinux_inode_post_setxattr---of 6
selinux_inode_post_setxattr.cold---of 1
selinux_inode_readlink---of 1
selinux_inode_removexattr---of 5
selinux_inode_rename---of 11
selinux_inode_rmdir---of 1
selinux_inode_setattr---of 9
selinux_inode_setsecctx---of 1
selinux_inode_setsecurity---of 10
selinux_inode_setxattr---of 24
selinux_inode_symlink---of 1
selinux_inode_unlink---of 1
selinux_ip_forward---of 16
selinux_ip_postroute---of 50
selinux_ip_postroute_compat---of 9
selinux_ipc_getsecid---of 1
selinux_ipc_permission---of 6
selinux_ipv4_forward---of 1
selinux_ipv4_output---of 7
selinux_ipv4_postroute---of 1
selinux_ipv6_forward---of 1
selinux_ipv6_output---of 7
selinux_ipv6_postroute---of 1
selinux_ismaclabel---of 1
selinux_kernel_act_as---of 3
selinux_kernel_create_files_as---of 5
selinux_kernel_load_data---of 4
selinux_kernel_module_request---of 1
selinux_kernel_read_file---of 10
selinux_kernfs_init_security---of 15
selinux_key_alloc---of 5
selinux_key_free---of 1
selinux_key_getsecurity---of 3
selinux_key_permission---of 4
selinux_lockdown---of 7
selinux_lsm_notifier_avc_callback---of 4
selinux_mmap_addr---of 4
selinux_mmap_file---of 5
selinux_mount---of 4
selinux_move_mount---of 1
selinux_msg_msg_alloc_security---of 1
selinux_msg_queue_alloc_security---of 1
selinux_msg_queue_associate---of 1
selinux_msg_queue_msgctl---of 6
selinux_msg_queue_msgrcv---of 3
selinux_msg_queue_msgsnd---of 6
selinux_netcache_avc_callback---of 4
selinux_netlink_send---of 13
selinux_netlink_send.cold---of 1
selinux_nf_register---of 1
selinux_nf_unregister---of 1
selinux_parse_skb.constprop.0---of 62
selinux_path_notify---of 8
selinux_perf_event_alloc---of 3
selinux_perf_event_free---of 1
selinux_perf_event_open---of 6
selinux_perf_event_read---of 1
selinux_perf_event_write---of 1
selinux_ptrace_access_check---of 4
selinux_ptrace_traceme---of 1
selinux_quota_on---of 1
selinux_quotactl---of 5
selinux_release_secctx---of 1
selinux_req_classify_flow---of 1
selinux_sb_alloc_security---of 3
selinux_sb_clone_mnt_opts---of 38
selinux_sb_clone_mnt_opts.cold---of 1
selinux_sb_eat_lsm_opts---of 37
selinux_sb_free_security---of 1
selinux_sb_kern_mount---of 1
selinux_sb_remount---of 27
selinux_sb_remount.cold---of 5
selinux_sb_show_options---of 17
selinux_sb_statfs---of 1
selinux_sctp_assoc_request---of 19
selinux_sctp_bind_connect---of 13
selinux_sctp_sk_clone---of 3
selinux_secctx_to_secid---of 1
selinux_secid_to_secctx---of 1
selinux_secmark_refcount_dec---of 1
selinux_secmark_refcount_inc---of 1
selinux_secmark_relabel_packet---of 1
selinux_sem_alloc_security---of 1
selinux_sem_associate---of 1
selinux_sem_semctl---of 6
selinux_sem_semop---of 2
selinux_set_mnt_opts---of 92
selinux_set_mnt_opts.cold---of 8
selinux_setprocattr---of 46
selinux_shm_alloc_security---of 1
selinux_shm_associate---of 1
selinux_shm_shmat---of 2
selinux_shm_shmctl---of 6
selinux_sk_alloc_security---of 5
selinux_sk_clone_security---of 1
selinux_sk_free_security---of 1
selinux_sk_getsecid---of 3
selinux_skb_peerlbl_sid---of 3
selinux_skb_peerlbl_sid.cold---of 1
selinux_sock_graft---of 6
selinux_sock_rcv_skb_compat---of 6
selinux_socket_accept---of 7
selinux_socket_bind---of 31
selinux_socket_connect---of 3
selinux_socket_connect_helper.isra.0---of 17
selinux_socket_create---of 24
selinux_socket_getpeername---of 1
selinux_socket_getpeersec_dgram---of 14
selinux_socket_getpeersec_stream---of 15
selinux_socket_getsockname---of 1
selinux_socket_getsockopt---of 1
selinux_socket_listen---of 1
selinux_socket_post_create---of 29
selinux_socket_recvmsg---of 1
selinux_socket_sendmsg---of 1
selinux_socket_setsockopt---of 3
selinux_socket_shutdown---of 1
selinux_socket_sock_rcv_skb---of 21
selinux_socket_socketpair---of 1
selinux_socket_unix_may_send---of 1
selinux_socket_unix_stream_connect---of 4
selinux_syslog---of 5
selinux_task_alloc---of 1
selinux_task_getioprio---of 1
selinux_task_getpgid---of 1
selinux_task_getscheduler---of 1
selinux_task_getsecid---of 1
selinux_task_getsid---of 1
selinux_task_kill---of 8
selinux_task_movememory---of 1
selinux_task_prlimit---of 6
selinux_task_setioprio---of 1
selinux_task_setnice---of 1
selinux_task_setpgid---of 1
selinux_task_setrlimit---of 4
selinux_task_setscheduler---of 1
selinux_task_to_inode---of 4
selinux_tun_dev_alloc_security---of 3
selinux_tun_dev_attach---of 1
selinux_tun_dev_attach_queue---of 1
selinux_tun_dev_create---of 1
selinux_tun_dev_free_security---of 1
selinux_tun_dev_open---of 5
selinux_umount---of 1
selinux_vm_enough_memory---of 1
show_sid---of 6
sock_has_perm---of 3
task_sid---of 18
__proc_set_tty---of 21
__tty_check_change---of 4
__tty_check_change.part.0---of 22
disassociate_ctty---of 2
disassociate_ctty.part.0---of 16
get_current_tty---of 7
no_tty---of 3
proc_clear_tty---of 1
session_clear_tty---of 5
tty_check_change---of 4
tty_get_pgrp---of 7
tty_jobctrl_ioctl4%of 58
tty_open_proc_set_tty---of 6
tty_signal_session_leader---of 29
perf_kprobe_destroy---of 1
perf_kprobe_init---of 13
perf_trace_add---of 8
perf_trace_buf_alloc67%of 9
perf_trace_buf_update100%of 1
perf_trace_del---of 4
perf_trace_destroy---of 1
perf_trace_event_init---of 35
perf_trace_event_unreg.isra.0---of 4
perf_trace_init---of 9
perf_uprobe_destroy---of 1
perf_uprobe_init---of 11
___perf_sw_event---of 31
__do_sys_perf_event_open---of 216
__ia32_sys_perf_event_open---of 1
__perf_addr_filters_adjust---of 18
__perf_event__output_id_sample---of 12
__perf_event_disable---of 9
__perf_event_enable---of 21
__perf_event_exit_context---of 11
__perf_event_header__init_id---of 18
__perf_event_init_context---of 1
__perf_event_output_stop---of 11
__perf_event_overflow---of 17
__perf_event_period---of 11
__perf_event_read---of 20
__perf_event_read_size---of 9
__perf_event_read_value---of 3
__perf_event_stop---of 5
__perf_event_task_sched_in---of 26
__perf_event_task_sched_out---of 111
__perf_install_in_context---of 15
__perf_pmu_output_stop---of 15
__perf_pmu_sched_task---of 11
__perf_read_group_add---of 25
__perf_remove_from_context---of 10
__perf_sw_event---of 3
__refcount_add.constprop.0---of 5
__x64_sys_perf_event_open---of 1
_free_event---of 48
_perf_event_disable---of 3
_perf_event_enable---of 7
_perf_event_period---of 9
_perf_event_reset---of 1
_perf_ioctl---of 134
calc_timer_values---of 12
cpu_clock_event_add---of 4
cpu_clock_event_del---of 3
cpu_clock_event_init---of 7
cpu_clock_event_read---of 1
cpu_clock_event_start---of 2
cpu_clock_event_stop---of 3
ctx_resched---of 15
ctx_sched_in.constprop.0---of 18
ctx_sched_out---of 36
event_function---of 18
event_function_call---of 15
event_function_local.constprop.0---of 25
event_sched_in---of 34
event_sched_out---of 30
exclusive_event_destroy---of 4
exclusive_event_installable---of 12
find_get_context---of 47
free_ctx---of 4
free_event---of 3
free_event_rcu---of 3
get_event_type---of 8
get_order---of 1
group_sched_out.part.0---of 7
inherit_event.constprop.0---of 24
inherit_task_group.isra.0---of 27
ktime_get_boottime_ns---of 1
ktime_get_clocktai_ns---of 1
ktime_get_real_ns---of 1
list_add_event---of 15
list_del_event---of 19
local_clock---of 1
merge_sched_in---of 51
nr_addr_filters_show---of 1
perf_addr_filters_splice---of 11
perf_adjust_period---of 31
perf_bp_event---of 4
perf_callchain---of 5
perf_compat_ioctl---of 6
perf_copy_attr---of 45
perf_cpu_time_max_percent_handler---of 8
perf_cpu_time_max_percent_handler.cold---of 1
perf_duration_warn---of 1
perf_duration_warn.cold---of 1
perf_event__header_size---of 18
perf_event__id_header_size---of 12
perf_event__output_id_sample---of 2
perf_event_account_interrupt---of 8
perf_event_addr_filters_apply---of 24
perf_event_addr_filters_exec---of 11
perf_event_addr_filters_sync---of 7
perf_event_alloc.part.0---of 151
perf_event_attrs---of 4
perf_event_aux_event---of 8
perf_event_bpf_event---of 9
perf_event_bpf_output---of 9
perf_event_comm---of 8
perf_event_comm.cold---of 1
perf_event_comm_output---of 20
perf_event_create_kernel_counter---of 18
perf_event_ctx_lock_nested---of 33
perf_event_delayed_put---of 4
perf_event_disable---of 4
perf_event_disable_inatomic---of 1
perf_event_disable_local---of 1
perf_event_enable---of 1
perf_event_exec---of 35
perf_event_exit_cpu---of 1
perf_event_exit_cpu_context---of 3
perf_event_exit_task---of 44
perf_event_for_each_child---of 5
perf_event_fork---of 2
perf_event_free_task---of 21
perf_event_get---of 5
perf_event_groups_insert---of 9
perf_event_header__init_id---of 2
perf_event_idx_default---of 1
perf_event_init_cpu---of 13
perf_event_init_task---of 28
perf_event_itrace_started---of 1
perf_event_ksymbol---of 12
perf_event_ksymbol.cold---of 1
perf_event_ksymbol_output---of 13
perf_event_mmap---of 58
perf_event_mmap_output---of 27
perf_event_mux_interval_ms_show---of 1
perf_event_mux_interval_ms_store---of 9
perf_event_namespaces---of 2
perf_event_namespaces.part.0---of 1
perf_event_namespaces_output---of 16
perf_event_nop_int---of 1
perf_event_output---of 15
perf_event_output_backward---of 15
perf_event_output_forward---of 15
perf_event_overflow---of 1
perf_event_pause---of 8
perf_event_period---of 1
perf_event_read---of 23
perf_event_read_event---of 14
perf_event_read_local---of 19
perf_event_read_value---of 1
perf_event_refresh---of 4
perf_event_release_kernel---of 50
perf_event_sched_in.isra.0---of 3
perf_event_set_output---of 22
perf_event_set_state.part.0---of 11
perf_event_stop.isra.0---of 4
perf_event_switch_output---of 21
perf_event_sysfs_show---of 3
perf_event_task---of 7
perf_event_task_disable---of 13
perf_event_task_enable---of 7
perf_event_task_output---of 31
perf_event_task_tick---of 36
perf_event_text_poke---of 4
perf_event_text_poke_output---of 22
perf_event_update_time---of 7
perf_event_update_userpage---of 25
perf_event_wakeup---of 3
perf_exclude_event---of 8
perf_fasync---of 1
perf_fill_ns_link_info---of 3
perf_get_aux_event---of 14
perf_get_event---of 4
perf_group_attach---of 10
perf_group_detach---of 40
perf_install_in_context---of 26
perf_ioctl---of 3
perf_iterate_ctx---of 14
perf_iterate_sb---of 37
perf_kprobe_event_init---of 8
perf_lock_task_context---of 63
perf_log_itrace_start---of 17
perf_log_lost_samples---of 8
perf_log_throttle---of 12
perf_mmap---of 83
perf_mmap_close---of 67
perf_mmap_fault---of 30
perf_mmap_open---of 4
perf_mux_hrtimer_handler---of 55
perf_mux_hrtimer_restart_ipi---of 5
perf_output_read---of 57
perf_output_sample---of 96
perf_output_sample_regs---of 3
perf_pending_event---of 12
perf_pmu_cancel_txn---of 3
perf_pmu_commit_txn---of 5
perf_pmu_disable---of 2
perf_pmu_enable---of 2
perf_pmu_migrate_context---of 24
perf_pmu_name---of 1
perf_pmu_nop_int---of 1
perf_pmu_register---of 52
perf_pmu_resched---of 4
perf_pmu_snapshot_aux---of 5
perf_pmu_start_txn---of 3
perf_pmu_unregister---of 9
perf_poll---of 10
perf_prepare_sample---of 93
perf_proc_update_handler---of 9
perf_read---of 35
perf_reboot---of 3
perf_register_guest_info_callbacks---of 5
perf_release---of 1
perf_remove_from_context---of 8
perf_sample_event_took---of 6
perf_sched_cb_dec---of 2
perf_sched_cb_inc---of 3
perf_sched_delayed---of 3
perf_swevent_add---of 14
perf_swevent_del---of 3
perf_swevent_event12%of 18
perf_swevent_get_recursion_context100%of 3
perf_swevent_hrtimer---of 8
perf_swevent_init---of 23
perf_swevent_put_recursion_context---of 1
perf_swevent_set_period---of 4
perf_swevent_start---of 1
perf_swevent_start_hrtimer.part.0---of 5
perf_swevent_stop---of 1
perf_tp_event23%of 45
perf_tp_event_init---of 5
perf_trace_run_bpf_submit50%of 4
perf_try_init_event---of 21
perf_unregister_guest_info_callbacks---of 5
perf_uprobe_event_init---of 8
pmu_dev_alloc---of 8
pmu_dev_is_visible---of 5
pmu_dev_release---of 1
put_ctx---of 13
rb_free_rcu---of 1
ref_ctr_offset_show---of 1
remote_function---of 4
retprobe_show---of 1
ring_buffer_attach---of 15
ring_buffer_get---of 30
ring_buffer_put---of 7
ring_buffer_wakeup---of 22
sw_perf_event_destroy---of 5
swevent_hlist_put_cpu---of 9
task_clock_event_add---of 4
task_clock_event_del---of 3
task_clock_event_init---of 7
task_clock_event_read---of 1
task_clock_event_start---of 2
task_clock_event_stop---of 3
task_function_call---of 6
tp_perf_event_destroy---of 1
type_show---of 1
unclone_ctx---of 6
visit_groups_merge.constprop.0.isra.0---of 52