1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_UACCESS_64_H #define _ASM_X86_UACCESS_64_H /* * User space memory access functions */ #include <linux/compiler.h> #include <linux/lockdep.h> #include <linux/kasan-checks.h> #include <asm/alternative.h> #include <asm/cpufeatures.h> #include <asm/page.h> /* * Copy To/From Userspace */ /* Handles exceptions in both to and from, but doesn't do access_ok */ __must_check unsigned long copy_user_enhanced_fast_string(void *to, const void *from, unsigned len); __must_check unsigned long copy_user_generic_string(void *to, const void *from, unsigned len); __must_check unsigned long copy_user_generic_unrolled(void *to, const void *from, unsigned len); static __always_inline __must_check unsigned long copy_user_generic(void *to, const void *from, unsigned len) { unsigned ret; /* * If CPU has ERMS feature, use copy_user_enhanced_fast_string. * Otherwise, if CPU has rep_good feature, use copy_user_generic_string. * Otherwise, use copy_user_generic_unrolled. */ alternative_call_2(copy_user_generic_unrolled, copy_user_generic_string, X86_FEATURE_REP_GOOD, copy_user_enhanced_fast_string, X86_FEATURE_ERMS, ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), "=d" (len)), "1" (to), "2" (from), "3" (len) : "memory", "rcx", "r8", "r9", "r10", "r11"); return ret; } static __always_inline __must_check unsigned long raw_copy_from_user(void *dst, const void __user *src, unsigned long size) { return copy_user_generic(dst, (__force void *)src, size); } static __always_inline __must_check unsigned long raw_copy_to_user(void __user *dst, const void *src, unsigned long size) { return copy_user_generic((__force void *)dst, src, size); } static __always_inline __must_check unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long size) { return copy_user_generic((__force void *)dst, (__force void *)src, size); } extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size, int zerorest); extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size); extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset, size_t len); static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size) { kasan_check_write(dst, size); return __copy_user_nocache(dst, src, size, 0); } static inline int __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) { kasan_check_write(dst, size); return __copy_user_flushcache(dst, src, size); } #endif /* _ASM_X86_UACCESS_64_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _IPV6_FRAG_H #define _IPV6_FRAG_H #include <linux/kernel.h> #include <net/addrconf.h> #include <net/ipv6.h> #include <net/inet_frag.h> enum ip6_defrag_users { IP6_DEFRAG_LOCAL_DELIVER, IP6_DEFRAG_CONNTRACK_IN, __IP6_DEFRAG_CONNTRACK_IN = IP6_DEFRAG_CONNTRACK_IN + USHRT_MAX, IP6_DEFRAG_CONNTRACK_OUT, __IP6_DEFRAG_CONNTRACK_OUT = IP6_DEFRAG_CONNTRACK_OUT + USHRT_MAX, IP6_DEFRAG_CONNTRACK_BRIDGE_IN, __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, }; /* * Equivalent of ipv4 struct ip */ struct frag_queue { struct inet_frag_queue q; int iif; __u16 nhoffset; u8 ecn; }; #if IS_ENABLED(CONFIG_IPV6) static inline void ip6frag_init(struct inet_frag_queue *q, const void *a) { struct frag_queue *fq = container_of(q, struct frag_queue, q); const struct frag_v6_compare_key *key = a; q->key.v6 = *key; fq->ecn = 0; } static inline u32 ip6frag_key_hashfn(const void *data, u32 len, u32 seed) { return jhash2(data, sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); } static inline u32 ip6frag_obj_hashfn(const void *data, u32 len, u32 seed) { const struct inet_frag_queue *fq = data; return jhash2((const u32 *)&fq->key.v6, sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); } static inline int ip6frag_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) { const struct frag_v6_compare_key *key = arg->key; const struct inet_frag_queue *fq = ptr; return !!memcmp(&fq->key, key, sizeof(*key)); } static inline void ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) { struct net_device *dev = NULL; struct sk_buff *head; rcu_read_lock(); if (fq->q.fqdir->dead) goto out_rcu_unlock; spin_lock(&fq->q.lock); if (fq->q.flags & INET_FRAG_COMPLETE) goto out; inet_frag_kill(&fq->q); dev = dev_get_by_index_rcu(net, fq->iif); if (!dev) goto out; __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); /* Don't send error if the first segment did not arrive. */ if (!(fq->q.flags & INET_FRAG_FIRST_IN)) goto out; /* sk_buff::dev and sk_buff::rbnode are unionized. So we * pull the head out of the tree in order to be able to * deal with head->dev. */ head = inet_frag_pull_head(&fq->q); if (!head) goto out; head->dev = dev; spin_unlock(&fq->q.lock); icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); kfree_skb(head); goto out_rcu_unlock; out: spin_unlock(&fq->q.lock); out_rcu_unlock: rcu_read_unlock(); inet_frag_put(&fq->q); } /* Check if the upper layer header is truncated in the first fragment. */ static inline bool ipv6frag_thdr_truncated(struct sk_buff *skb, int start, u8 *nexthdrp) { u8 nexthdr = *nexthdrp; __be16 frag_off; int offset; offset = ipv6_skip_exthdr(skb, start, &nexthdr, &frag_off); if (offset < 0 || (frag_off & htons(IP6_OFFSET))) return false; switch (nexthdr) { case NEXTHDR_TCP: offset += sizeof(struct tcphdr); break; case NEXTHDR_UDP: offset += sizeof(struct udphdr); break; case NEXTHDR_ICMP: offset += sizeof(struct icmp6hdr); break; default: offset += 1; } if (offset > skb->len) return true; return false; } #endif #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_IO_H #define _ASM_X86_IO_H /* * This file contains the definitions for the x86 IO instructions * inb/inw/inl/outb/outw/outl and the "string versions" of the same * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing" * versions of the single-IO instructions (inb_p/inw_p/..). * * This file is not meant to be obfuscating: it's just complicated * to (a) handle it all in a way that makes gcc able to optimize it * as well as possible and (b) trying to avoid writing the same thing * over and over again with slight variations and possibly making a * mistake somewhere. */ /* * Thanks to James van Artsdalen for a better timing-fix than * the two short jumps: using outb's to a nonexistent port seems * to guarantee better timings even on fast machines. * * On the other hand, I'd like to be sure of a non-existent port: * I feel a bit unsafe about using 0x80 (should be safe, though) * * Linus */ /* * Bit simplified and optimized by Jan Hubicka * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999. * * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added, * isa_read[wl] and isa_write[wl] fixed * - Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ #define ARCH_HAS_IOREMAP_WC #define ARCH_HAS_IOREMAP_WT #include <linux/string.h> #include <linux/compiler.h> #include <asm/page.h> #include <asm/early_ioremap.h> #include <asm/pgtable_types.h> #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ { type ret; asm volatile("mov" size " %1,%0":reg (ret) \ :"m" (*(volatile type __force *)addr) barrier); return ret; } #define build_mmio_write(name, size, type, reg, barrier) \ static inline void name(type val, volatile void __iomem *addr) \ { asm volatile("mov" size " %0,%1": :reg (val), \ "m" (*(volatile type __force *)addr) barrier); } build_mmio_read(readb, "b", unsigned char, "=q", :"memory") build_mmio_read(readw, "w", unsigned short, "=r", :"memory") build_mmio_read(readl, "l", unsigned int, "=r", :"memory") build_mmio_read(__readb, "b", unsigned char, "=q", ) build_mmio_read(__readw, "w", unsigned short, "=r", ) build_mmio_read(__readl, "l", unsigned int, "=r", ) build_mmio_write(writeb, "b", unsigned char, "q", :"memory") build_mmio_write(writew, "w", unsigned short, "r", :"memory") build_mmio_write(writel, "l", unsigned int, "r", :"memory") build_mmio_write(__writeb, "b", unsigned char, "q", ) build_mmio_write(__writew, "w", unsigned short, "r", ) build_mmio_write(__writel, "l", unsigned int, "r", ) #define readb readb #define readw readw #define readl readl #define readb_relaxed(a) __readb(a) #define readw_relaxed(a) __readw(a) #define readl_relaxed(a) __readl(a) #define __raw_readb __readb #define __raw_readw __readw #define __raw_readl __readl #define writeb writeb #define writew writew #define writel writel #define writeb_relaxed(v, a) __writeb(v, a) #define writew_relaxed(v, a) __writew(v, a) #define writel_relaxed(v, a) __writel(v, a) #define __raw_writeb __writeb #define __raw_writew __writew #define __raw_writel __writel #ifdef CONFIG_X86_64 build_mmio_read(readq, "q", u64, "=r", :"memory") build_mmio_read(__readq, "q", u64, "=r", ) build_mmio_write(writeq, "q", u64, "r", :"memory") build_mmio_write(__writeq, "q", u64, "r", ) #define readq_relaxed(a) __readq(a) #define writeq_relaxed(v, a) __writeq(v, a) #define __raw_readq __readq #define __raw_writeq __writeq /* Let people know that we have them */ #define readq readq #define writeq writeq #endif #define ARCH_HAS_VALID_PHYS_ADDR_RANGE extern int valid_phys_addr_range(phys_addr_t addr, size_t size); extern int valid_mmap_phys_addr_range(unsigned long pfn, size_t size); /** * virt_to_phys - map virtual addresses to physical * @address: address to remap * * The returned physical address is the physical (CPU) mapping for * the memory address given. It is only valid to use this function on * addresses directly mapped or allocated via kmalloc. * * This function does not give bus mappings for DMA transfers. In * almost all conceivable cases a device driver should not be using * this function */ static inline phys_addr_t virt_to_phys(volatile void *address) { return __pa(address); } #define virt_to_phys virt_to_phys /** * phys_to_virt - map physical address to virtual * @address: address to remap * * The returned virtual address is a current CPU mapping for * the memory address given. It is only valid to use this function on * addresses that have a kernel mapping * * This function does not handle bus mappings for DMA transfers. In * almost all conceivable cases a device driver should not be using * this function */ static inline void *phys_to_virt(phys_addr_t address) { return __va(address); } #define phys_to_virt phys_to_virt /* * Change "struct page" to physical address. */ #define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) /* * ISA I/O bus memory addresses are 1:1 with the physical address. * However, we truncate the address to unsigned int to avoid undesirable * promitions in legacy drivers. */ static inline unsigned int isa_virt_to_bus(volatile void *address) { return (unsigned int)virt_to_phys(address); } #define isa_bus_to_virt phys_to_virt /* * However PCI ones are not necessarily 1:1 and therefore these interfaces * are forbidden in portable PCI drivers. * * Allow them on x86 for legacy drivers, though. */ #define virt_to_bus virt_to_phys #define bus_to_virt phys_to_virt /* * The default ioremap() behavior is non-cached; if you need something * else, you probably want one of the following. */ extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); #define ioremap_uc ioremap_uc extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); #define ioremap_cache ioremap_cache extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); #define ioremap_prot ioremap_prot extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size); #define ioremap_encrypted ioremap_encrypted /** * ioremap - map bus memory into CPU space * @offset: bus address of the memory * @size: size of the resource to map * * ioremap performs a platform specific sequence of operations to * make bus memory CPU accessible via the readb/readw/readl/writeb/ * writew/writel functions and the other mmio helpers. The returned * address is not guaranteed to be usable directly as a virtual * address. * * If the area you are trying to map is a PCI BAR you should have a * look at pci_iomap(). */ void __iomem *ioremap(resource_size_t offset, unsigned long size); #define ioremap ioremap extern void iounmap(volatile void __iomem *addr); #define iounmap iounmap extern void set_iounmap_nonlazy(void); #ifdef __KERNEL__ void memcpy_fromio(void *, const volatile void __iomem *, size_t); void memcpy_toio(volatile void __iomem *, const void *, size_t); void memset_io(volatile void __iomem *, int, size_t); #define memcpy_fromio memcpy_fromio #define memcpy_toio memcpy_toio #define memset_io memset_io #include <asm-generic/iomap.h> /* * ISA space is 'always mapped' on a typical x86 system, no need to * explicitly ioremap() it. The fact that the ISA IO space is mapped * to PAGE_OFFSET is pure coincidence - it does not mean ISA values * are physical addresses. The following constant pointer can be * used as the IO-area pointer (it can be iounmapped as well, so the * analogy with PCI is quite large): */ #define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET)) #endif /* __KERNEL__ */ extern void native_io_delay(void); extern int io_delay_type; extern void io_delay_init(void); #if defined(CONFIG_PARAVIRT) #include <asm/paravirt.h> #else static inline void slow_down_io(void) { native_io_delay(); #ifdef REALLY_SLOW_IO native_io_delay(); native_io_delay(); native_io_delay(); #endif } #endif #ifdef CONFIG_AMD_MEM_ENCRYPT #include <linux/jump_label.h> extern struct static_key_false sev_enable_key; static inline bool sev_key_active(void) { return static_branch_unlikely(&sev_enable_key); } #else /* !CONFIG_AMD_MEM_ENCRYPT */ static inline bool sev_key_active(void) { return false; } #endif /* CONFIG_AMD_MEM_ENCRYPT */ #define BUILDIO(bwl, bw, type) \ static inline void out##bwl(unsigned type value, int port) \ { \ asm volatile("out" #bwl " %" #bw "0, %w1" \ : : "a"(value), "Nd"(port)); \ } \ \ static inline unsigned type in##bwl(int port) \ { \ unsigned type value; \ asm volatile("in" #bwl " %w1, %" #bw "0" \ : "=a"(value) : "Nd"(port)); \ return value; \ } \ \ static inline void out##bwl##_p(unsigned type value, int port) \ { \ out##bwl(value, port); \ slow_down_io(); \ } \ \ static inline unsigned type in##bwl##_p(int port) \ { \ unsigned type value = in##bwl(port); \ slow_down_io(); \ return value; \ } \ \ static inline void outs##bwl(int port, const void *addr, unsigned long count) \ { \ if (sev_key_active()) { \ unsigned type *value = (unsigned type *)addr; \ while (count) { \ out##bwl(*value, port); \ value++; \ count--; \ } \ } else { \ asm volatile("rep; outs" #bwl \ : "+S"(addr), "+c"(count) \ : "d"(port) : "memory"); \ } \ } \ \ static inline void ins##bwl(int port, void *addr, unsigned long count) \ { \ if (sev_key_active()) { \ unsigned type *value = (unsigned type *)addr; \ while (count) { \ *value = in##bwl(port); \ value++; \ count--; \ } \ } else { \ asm volatile("rep; ins" #bwl \ : "+D"(addr), "+c"(count) \ : "d"(port) : "memory"); \ } \ } BUILDIO(b, b, char) BUILDIO(w, w, short) BUILDIO(l, , int) #define inb inb #define inw inw #define inl inl #define inb_p inb_p #define inw_p inw_p #define inl_p inl_p #define insb insb #define insw insw #define insl insl #define outb outb #define outw outw #define outl outl #define outb_p outb_p #define outw_p outw_p #define outl_p outl_p #define outsb outsb #define outsw outsw #define outsl outsl extern void *xlate_dev_mem_ptr(phys_addr_t phys); extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); #define xlate_dev_mem_ptr xlate_dev_mem_ptr #define unxlate_dev_mem_ptr unxlate_dev_mem_ptr extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, enum page_cache_mode pcm); extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); #define ioremap_wc ioremap_wc extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size); #define ioremap_wt ioremap_wt extern bool is_early_ioremap_ptep(pte_t *ptep); #define IO_SPACE_LIMIT 0xffff #include <asm-generic/io.h> #undef PCI_IOBASE #ifdef CONFIG_MTRR extern int __must_check arch_phys_wc_index(int handle); #define arch_phys_wc_index arch_phys_wc_index extern int __must_check arch_phys_wc_add(unsigned long base, unsigned long size); extern void arch_phys_wc_del(int handle); #define arch_phys_wc_add arch_phys_wc_add #endif #ifdef CONFIG_X86_PAT extern int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size); extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size); #define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc #endif extern bool arch_memremap_can_ram_remap(resource_size_t offset, unsigned long size, unsigned long flags); #define arch_memremap_can_ram_remap arch_memremap_can_ram_remap extern bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size); /** * iosubmit_cmds512 - copy data to single MMIO location, in 512-bit units * @dst: destination, in MMIO space (must be 512-bit aligned) * @src: source * @count: number of 512 bits quantities to submit * * Submit data from kernel space to MMIO space, in units of 512 bits at a * time. Order of access is not guaranteed, nor is a memory barrier * performed afterwards. * * Warning: Do not use this helper unless your driver has checked that the CPU * instruction is supported on the platform. */ static inline void iosubmit_cmds512(void __iomem *dst, const void *src, size_t count) { const u8 *from = src; const u8 *end = from + count * 64; while (from < end) { movdir64b(dst, from); from += 64; } } #endif /* _ASM_X86_IO_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/mount.h> #include <linux/seq_file.h> #include <linux/poll.h> #include <linux/ns_common.h> #include <linux/fs_pin.h> struct mnt_namespace { atomic_t count; struct ns_common ns; struct mount * root; /* * Traversal and modification of .list is protected by either * - taking namespace_sem for write, OR * - taking namespace_sem for read AND taking .ns_lock. */ struct list_head list; spinlock_t ns_lock; struct user_namespace *user_ns; struct ucounts *ucounts; u64 seq; /* Sequence number to prevent loops */ wait_queue_head_t poll; u64 event; unsigned int mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; } __randomize_layout; struct mnt_pcp { int mnt_count; int mnt_writers; }; struct mountpoint { struct hlist_node m_hash; struct dentry *m_dentry; struct hlist_head m_list; int m_count; }; struct mount { struct hlist_node mnt_hash; struct mount *mnt_parent; struct dentry *mnt_mountpoint; struct vfsmount mnt; union { struct rcu_head mnt_rcu; struct llist_node mnt_llist; }; #ifdef CONFIG_SMP struct mnt_pcp __percpu *mnt_pcp; #else int mnt_count; int mnt_writers; #endif struct list_head mnt_mounts; /* list of children, anchored here */ struct list_head mnt_child; /* and going through their mnt_child */ struct list_head mnt_instance; /* mount instance on sb->s_mounts */ const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ struct list_head mnt_list; struct list_head mnt_expire; /* link in fs-specific expiry list */ struct list_head mnt_share; /* circular list of shared mounts */ struct list_head mnt_slave_list;/* list of slave mounts */ struct list_head mnt_slave; /* slave list entry */ struct mount *mnt_master; /* slave is on master->mnt_slave_list */ struct mnt_namespace *mnt_ns; /* containing namespace */ struct mountpoint *mnt_mp; /* where is it mounted */ union { struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ struct hlist_node mnt_umount; }; struct list_head mnt_umounting; /* list entry for umount propagation */ #ifdef CONFIG_FSNOTIFY struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; #endif int mnt_id; /* mount identifier */ int mnt_group_id; /* peer group identifier */ int mnt_expiry_mark; /* true if marked for expiry */ struct hlist_head mnt_pins; struct hlist_head mnt_stuck_children; } __randomize_layout; #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */ static inline struct mount *real_mount(struct vfsmount *mnt) { return container_of(mnt, struct mount, mnt); } static inline int mnt_has_parent(struct mount *mnt) { return mnt != mnt->mnt_parent; } static inline int is_mounted(struct vfsmount *mnt) { /* neither detached nor internal? */ return !IS_ERR_OR_NULL(real_mount(mnt)->mnt_ns); } extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); extern int __legitimize_mnt(struct vfsmount *, unsigned); extern bool legitimize_mnt(struct vfsmount *, unsigned); static inline bool __path_is_mountpoint(const struct path *path) { struct mount *m = __lookup_mnt(path->mnt, path->dentry); return m && likely(!(m->mnt.mnt_flags & MNT_SYNC_UMOUNT)); } extern void __detach_mounts(struct dentry *dentry); static inline void detach_mounts(struct dentry *dentry) { if (!d_mountpoint(dentry)) return; __detach_mounts(dentry); } static inline void get_mnt_ns(struct mnt_namespace *ns) { atomic_inc(&ns->count); } extern seqlock_t mount_lock; static inline void lock_mount_hash(void) { write_seqlock(&mount_lock); } static inline void unlock_mount_hash(void) { write_sequnlock(&mount_lock); } struct proc_mounts { struct mnt_namespace *ns; struct path root; int (*show)(struct seq_file *, struct vfsmount *); struct mount cursor; }; extern const struct seq_operations mounts_op; extern bool __is_local_mountpoint(struct dentry *dentry); static inline bool is_local_mountpoint(struct dentry *dentry) { if (!d_mountpoint(dentry)) return false; return __is_local_mountpoint(dentry); } static inline bool is_anon_ns(struct mnt_namespace *ns) { return ns->seq == 0; } extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HIGHMEM_H #define _LINUX_HIGHMEM_H #include <linux/fs.h> #include <linux/kernel.h> #include <linux/bug.h> #include <linux/mm.h> #include <linux/uaccess.h> #include <linux/hardirq.h> #include <asm/cacheflush.h> #ifndef ARCH_HAS_FLUSH_ANON_PAGE static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) { } #endif #ifndef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE static inline void flush_kernel_dcache_page(struct page *page) { } static inline void flush_kernel_vmap_range(void *vaddr, int size) { } static inline void invalidate_kernel_vmap_range(void *vaddr, int size) { } #endif #include <asm/kmap_types.h> #ifdef CONFIG_HIGHMEM extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); extern void kunmap_atomic_high(void *kvaddr); #include <asm/highmem.h> #ifndef ARCH_HAS_KMAP_FLUSH_TLB static inline void kmap_flush_tlb(unsigned long addr) { } #endif #ifndef kmap_prot #define kmap_prot PAGE_KERNEL #endif void *kmap_high(struct page *page); static inline void *kmap(struct page *page) { void *addr; might_sleep(); if (!PageHighMem(page)) addr = page_address(page); else addr = kmap_high(page); kmap_flush_tlb((unsigned long)addr); return addr; } void kunmap_high(struct page *page); static inline void kunmap(struct page *page) { might_sleep(); if (!PageHighMem(page)) return; kunmap_high(page); } /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB * invalidation when the kmap pool wraps. * * However when holding an atomic kmap it is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. * * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap * gives a more generic (and caching) interface. But kmap_atomic can * be used in IRQ contexts, so in some (very limited) cases we need * it. */ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); return kmap_atomic_high_prot(page, prot); } #define kmap_atomic(page) kmap_atomic_prot(page, kmap_prot) /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); extern atomic_long_t _totalhigh_pages; static inline unsigned long totalhigh_pages(void) { return (unsigned long)atomic_long_read(&_totalhigh_pages); } static inline void totalhigh_pages_inc(void) { atomic_long_inc(&_totalhigh_pages); } static inline void totalhigh_pages_dec(void) { atomic_long_dec(&_totalhigh_pages); } static inline void totalhigh_pages_add(long count) { atomic_long_add(count, &_totalhigh_pages); } static inline void totalhigh_pages_set(long val) { atomic_long_set(&_totalhigh_pages, val); } void kmap_flush_unused(void); struct page *kmap_to_page(void *addr); #else /* CONFIG_HIGHMEM */ static inline unsigned int nr_free_highpages(void) { return 0; } static inline struct page *kmap_to_page(void *addr) { return virt_to_page(addr); } static inline unsigned long totalhigh_pages(void) { return 0UL; } static inline void *kmap(struct page *page) { might_sleep(); return page_address(page); } static inline void kunmap_high(struct page *page) { } static inline void kunmap(struct page *page) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(page_address(page)); #endif } static inline void *kmap_atomic(struct page *page) { preempt_disable(); pagefault_disable(); return page_address(page); } #define kmap_atomic_prot(page, prot) kmap_atomic(page) static inline void kunmap_atomic_high(void *addr) { /* * Mostly nothing to do in the CONFIG_HIGHMEM=n case as kunmap_atomic() * handles re-enabling faults + preemption */ #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(addr); #endif } #define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) #define kmap_flush_unused() do {} while(0) #endif /* CONFIG_HIGHMEM */ #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) DECLARE_PER_CPU(int, __kmap_atomic_idx); static inline int kmap_atomic_idx_push(void) { int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1; #ifdef CONFIG_DEBUG_HIGHMEM WARN_ON_ONCE(in_irq() && !irqs_disabled()); BUG_ON(idx >= KM_TYPE_NR); #endif return idx; } static inline int kmap_atomic_idx(void) { return __this_cpu_read(__kmap_atomic_idx) - 1; } static inline void kmap_atomic_idx_pop(void) { #ifdef CONFIG_DEBUG_HIGHMEM int idx = __this_cpu_dec_return(__kmap_atomic_idx); BUG_ON(idx < 0); #else __this_cpu_dec(__kmap_atomic_idx); #endif } #endif /* * Prevent people trying to call kunmap_atomic() as if it were kunmap() * kunmap_atomic() should get the return value of kmap_atomic, not the page. */ #define kunmap_atomic(addr) \ do { \ BUILD_BUG_ON(__same_type((addr), struct page *)); \ kunmap_atomic_high(addr); \ pagefault_enable(); \ preempt_enable(); \ } while (0) /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ #ifndef clear_user_highpage static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { void *addr = kmap_atomic(page); clear_user_page(addr, vaddr, page); kunmap_atomic(addr); } #endif #ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE /** * __alloc_zeroed_user_highpage - Allocate a zeroed HIGHMEM page for a VMA with caller-specified movable GFP flags * @movableflags: The GFP flags related to the pages future ability to move like __GFP_MOVABLE * @vma: The VMA the page is to be allocated for * @vaddr: The virtual address the page will be inserted into * * This function will allocate a page for a VMA but the caller is expected * to specify via movableflags whether the page will be movable in the * future or not * * An architecture may override this function by defining * __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE and providing their own * implementation. */ static inline struct page * __alloc_zeroed_user_highpage(gfp_t movableflags, struct vm_area_struct *vma, unsigned long vaddr) { struct page *page = alloc_page_vma(GFP_HIGHUSER | movableflags, vma, vaddr); if (page) clear_user_highpage(page, vaddr); return page; } #endif /** * alloc_zeroed_user_highpage_movable - Allocate a zeroed HIGHMEM page for a VMA that the caller knows can move * @vma: The VMA the page is to be allocated for * @vaddr: The virtual address the page will be inserted into * * This function will allocate a page for a VMA that the caller knows will * be able to migrate in the future using move_pages() or reclaimed */ static inline struct page * alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, unsigned long vaddr) { return __alloc_zeroed_user_highpage(__GFP_MOVABLE, vma, vaddr); } static inline void clear_highpage(struct page *page) { void *kaddr = kmap_atomic(page); clear_page(kaddr); kunmap_atomic(kaddr); } static inline void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2) { void *kaddr = kmap_atomic(page); BUG_ON(end1 > PAGE_SIZE || end2 > PAGE_SIZE); if (end1 > start1) memset(kaddr + start1, 0, end1 - start1); if (end2 > start2) memset(kaddr + start2, 0, end2 - start2); kunmap_atomic(kaddr); flush_dcache_page(page); } static inline void zero_user_segment(struct page *page, unsigned start, unsigned end) { zero_user_segments(page, start, end, 0, 0); } static inline void zero_user(struct page *page, unsigned start, unsigned size) { zero_user_segments(page, start, start + size, 0, 0); } #ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE static inline void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { char *vfrom, *vto; vfrom = kmap_atomic(from); vto = kmap_atomic(to); copy_user_page(vto, vfrom, vaddr, to); kunmap_atomic(vto); kunmap_atomic(vfrom); } #endif #ifndef __HAVE_ARCH_COPY_HIGHPAGE static inline void copy_highpage(struct page *to, struct page *from) { char *vfrom, *vto; vfrom = kmap_atomic(from); vto = kmap_atomic(to); copy_page(vto, vfrom); kunmap_atomic(vto); kunmap_atomic(vfrom); } #endif #endif /* _LINUX_HIGHMEM_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BLK_INTERNAL_H #define BLK_INTERNAL_H #include <linux/idr.h> #include <linux/blk-mq.h> #include <linux/part_stat.h> #include <linux/blk-crypto.h> #include <xen/xen.h> #include "blk-crypto-internal.h" #include "blk-mq.h" #include "blk-mq-sched.h" /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) extern struct dentry *blk_debugfs_root; struct blk_flush_queue { unsigned int flush_pending_idx:1; unsigned int flush_running_idx:1; blk_status_t rq_status; unsigned long flush_pending_since; struct list_head flush_queue[2]; struct list_head flush_data_in_flight; struct request *flush_rq; struct lock_class_key key; spinlock_t mq_flush_lock; }; extern struct kmem_cache *blk_requestq_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; static inline struct blk_flush_queue * blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) { return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq; } static inline void __blk_get_queue(struct request_queue *q) { kobject_get(&q->kobj); } bool is_flush_rq(struct request *req); struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, gfp_t flags); void blk_free_flush_queue(struct blk_flush_queue *q); void blk_freeze_queue(struct request_queue *q); static inline bool biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) { unsigned long mask = queue_segment_boundary(q); phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset; phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset; if (addr1 + vec1->bv_len != addr2) return false; if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2->bv_page)) return false; if ((addr1 | mask) != ((addr2 + vec2->bv_len - 1) | mask)) return false; return true; } static inline bool __bvec_gap_to_prev(struct request_queue *q, struct bio_vec *bprv, unsigned int offset) { return (offset & queue_virt_boundary(q)) || ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q)); } /* * Check if adding a bio_vec after bprv with offset would create a gap in * the SG list. Most drivers don't care about this, but some do. */ static inline bool bvec_gap_to_prev(struct request_queue *q, struct bio_vec *bprv, unsigned int offset) { if (!queue_virt_boundary(q)) return false; return __bvec_gap_to_prev(q, bprv, offset); } static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, unsigned int nr_segs) { rq->nr_phys_segments = nr_segs; rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; rq->ioprio = bio_prio(bio); if (bio->bi_disk) rq->rq_disk = bio->bi_disk; } #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_flush_integrity(void); bool __bio_integrity_endio(struct bio *); void bio_integrity_free(struct bio *bio); static inline bool bio_integrity_endio(struct bio *bio) { if (bio_integrity(bio)) return __bio_integrity_endio(bio); return true; } bool blk_integrity_merge_rq(struct request_queue *, struct request *, struct request *); bool blk_integrity_merge_bio(struct request_queue *, struct request *, struct bio *); static inline bool integrity_req_gap_back_merge(struct request *req, struct bio *next) { struct bio_integrity_payload *bip = bio_integrity(req->bio); struct bio_integrity_payload *bip_next = bio_integrity(next); return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1], bip_next->bip_vec[0].bv_offset); } static inline bool integrity_req_gap_front_merge(struct request *req, struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_integrity_payload *bip_next = bio_integrity(req->bio); return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1], bip_next->bip_vec[0].bv_offset); } void blk_integrity_add(struct gendisk *); void blk_integrity_del(struct gendisk *); #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline bool blk_integrity_merge_rq(struct request_queue *rq, struct request *r1, struct request *r2) { return true; } static inline bool blk_integrity_merge_bio(struct request_queue *rq, struct request *r, struct bio *b) { return true; } static inline bool integrity_req_gap_back_merge(struct request *req, struct bio *next) { return false; } static inline bool integrity_req_gap_front_merge(struct request *req, struct bio *bio) { return false; } static inline void blk_flush_integrity(void) { } static inline bool bio_integrity_endio(struct bio *bio) { return true; } static inline void bio_integrity_free(struct bio *bio) { } static inline void blk_integrity_add(struct gendisk *disk) { } static inline void blk_integrity_del(struct gendisk *disk) { } #endif /* CONFIG_BLK_DEV_INTEGRITY */ unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **same_queue_rq); bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio, unsigned int nr_segs); void blk_account_io_start(struct request *req); void blk_account_io_done(struct request *req, u64 now); /* * Plug flush limits */ #define BLK_MAX_REQUEST_COUNT 32 #define BLK_PLUG_FLUSH_SIZE (128 * 1024) /* * Internal elevator interface */ #define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED) void blk_insert_flush(struct request *rq); void elevator_init_mq(struct request_queue *q); int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e); void __elevator_exit(struct request_queue *, struct elevator_queue *); int elv_register_queue(struct request_queue *q, bool uevent); void elv_unregister_queue(struct request_queue *q); static inline void elevator_exit(struct request_queue *q, struct elevator_queue *e) { lockdep_assert_held(&q->sysfs_lock); blk_mq_sched_free_requests(q); __elevator_exit(q, e); } struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); ssize_t part_timeout_store(struct device *, struct device_attribute *, const char *, size_t); void __blk_queue_split(struct bio **bio, unsigned int *nr_segs); int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); int blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next); unsigned int blk_recalc_rq_segments(struct request *rq); void blk_rq_set_mixed_merge(struct request *rq); bool blk_rq_merge_ok(struct request *rq, struct bio *bio); enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); int blk_dev_init(void); /* * Contribute to IO statistics IFF: * * a) it's attached to a gendisk, and * b) the queue had IO stats enabled when this request was started */ static inline bool blk_do_io_stat(struct request *rq) { return rq->rq_disk && (rq->rq_flags & RQF_IO_STAT); } static inline void req_set_nomerge(struct request_queue *q, struct request *req) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; } /* * The max size one bio can handle is UINT_MAX becasue bvec_iter.bi_size * is defined as 'unsigned int', meantime it has to aligned to with logical * block size which is the minimum accepted unit by hardware. */ static inline unsigned int bio_allowed_max_sectors(struct request_queue *q) { return round_down(UINT_MAX, queue_logical_block_size(q)) >> 9; } /* * The max bio size which is aligned to q->limits.discard_granularity. This * is a hint to split large discard bio in generic block layer, then if device * driver needs to split the discard bio into smaller ones, their bi_size can * be very probably and easily aligned to discard_granularity of the device's * queue. */ static inline unsigned int bio_aligned_discard_max_sectors( struct request_queue *q) { return round_down(UINT_MAX, q->limits.discard_granularity) >> SECTOR_SHIFT; } /* * Internal io_context interface */ void get_io_context(struct io_context *ioc); struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q); struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, gfp_t gfp_mask); void ioc_clear_queue(struct request_queue *q); int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); /* * Internal throttling interface */ #ifdef CONFIG_BLK_DEV_THROTTLING extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); extern void blk_throtl_register_queue(struct request_queue *q); extern void blk_throtl_charge_bio_split(struct bio *bio); bool blk_throtl_bio(struct bio *bio); #else /* CONFIG_BLK_DEV_THROTTLING */ static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } static inline void blk_throtl_register_queue(struct request_queue *q) { } static inline void blk_throtl_charge_bio_split(struct bio *bio) { } static inline bool blk_throtl_bio(struct bio *bio) { return false; } #endif /* CONFIG_BLK_DEV_THROTTLING */ #ifdef CONFIG_BLK_DEV_THROTTLING_LOW extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); extern ssize_t blk_throtl_sample_time_store(struct request_queue *q, const char *page, size_t count); extern void blk_throtl_bio_endio(struct bio *bio); extern void blk_throtl_stat_add(struct request *rq, u64 time); #else static inline void blk_throtl_bio_endio(struct bio *bio) { } static inline void blk_throtl_stat_add(struct request *rq, u64 time) { } #endif #ifdef CONFIG_BOUNCE extern int init_emergency_isa_pool(void); extern void blk_queue_bounce(struct request_queue *q, struct bio **bio); #else static inline int init_emergency_isa_pool(void) { return 0; } static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) { } #endif /* CONFIG_BOUNCE */ #ifdef CONFIG_BLK_CGROUP_IOLATENCY extern int blk_iolatency_init(struct request_queue *q); #else static inline int blk_iolatency_init(struct request_queue *q) { return 0; } #endif struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp); #ifdef CONFIG_BLK_DEV_ZONED void blk_queue_free_zone_bitmaps(struct request_queue *q); #else static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {} #endif struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); int blk_alloc_devt(struct hd_struct *part, dev_t *devt); void blk_free_devt(dev_t devt); void blk_invalidate_devt(dev_t devt); char *disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 void delete_partition(struct hd_struct *part); int bdev_add_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); int bdev_del_partition(struct block_device *bdev, int partno); int bdev_resize_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); int disk_expand_part_tbl(struct gendisk *disk, int target); int hd_ref_init(struct hd_struct *part); /* no need to get/put refcount of part0 */ static inline int hd_struct_try_get(struct hd_struct *part) { if (part->partno) return percpu_ref_tryget_live(&part->ref); return 1; } static inline void hd_struct_put(struct hd_struct *part) { if (part->partno) percpu_ref_put(&part->ref); } static inline void hd_free_part(struct hd_struct *part) { free_percpu(part->dkstats); kfree(part->info); percpu_ref_exit(&part->ref); } /* * Any access of part->nr_sects which is not protected by partition * bd_mutex or gendisk bdev bd_mutex, should be done using this * accessor function. * * Code written along the lines of i_size_read() and i_size_write(). * CONFIG_PREEMPTION case optimizes the case of UP kernel with preemption * on. */ static inline sector_t part_nr_sects_read(struct hd_struct *part) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) sector_t nr_sects; unsigned seq; do { seq = read_seqcount_begin(&part->nr_sects_seq); nr_sects = part->nr_sects; } while (read_seqcount_retry(&part->nr_sects_seq, seq)); return nr_sects; #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) sector_t nr_sects; preempt_disable(); nr_sects = part->nr_sects; preempt_enable(); return nr_sects; #else return part->nr_sects; #endif } /* * Should be called with mutex lock held (typically bd_mutex) of partition * to provide mutual exlusion among writers otherwise seqcount might be * left in wrong state leaving the readers spinning infinitely. */ static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) preempt_disable(); write_seqcount_begin(&part->nr_sects_seq); part->nr_sects = size; write_seqcount_end(&part->nr_sects_seq); preempt_enable(); #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) preempt_disable(); part->nr_sects = size; preempt_enable(); #else part->nr_sects = size; #endif } int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); #endif /* BLK_INTERNAL_H */
1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 // SPDX-License-Identifier: GPL-2.0-only /* * Simple NUMA memory policy for the Linux kernel. * * Copyright 2003,2004 Andi Kleen, SuSE Labs. * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. * * NUMA policy allows the user to give hints in which node(s) memory should * be allocated. * * Support four policies per VMA and per process: * * The VMA policy has priority over the process policy for a page fault. * * interleave Allocate memory interleaved over a set of nodes, * with normal fallback if it fails. * For VMA based allocations this interleaves based on the * offset into the backing object or offset into the mapping * for anonymous memory. For process policy an process counter * is used. * * bind Only allocate memory on a specific set of nodes, * no fallback. * FIXME: memory is allocated starting with the first node * to the last. It would be better if bind would truly restrict * the allocation to memory nodes instead * * preferred Try a specific node first before normal fallback. * As a special case NUMA_NO_NODE here means do the allocation * on the local CPU. This is normally identical to default, * but useful to set in a VMA when you have a non default * process policy. * * default Allocate on the local node first, or when on a VMA * use the process policy. This is what Linux always did * in a NUMA aware kernel and still does by, ahem, default. * * The process policy is applied for most non interrupt memory allocations * in that process' context. Interrupts ignore the policies and always * try to allocate on the local CPU. The VMA policy is only applied for memory * allocations for a VMA in the VM. * * Currently there are a few corner cases in swapping where the policy * is not applied, but the majority should be handled. When process policy * is used it is not remembered over swap outs/swap ins. * * Only the highest zone in the zone hierarchy gets policied. Allocations * requesting a lower zone just use default policy. This implies that * on systems with highmem kernel lowmem allocation don't get policied. * Same with GFP_DMA allocations. * * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between * all users and remembered even when nobody has memory mapped. */ /* Notebook: fix mmap readahead to honour policy and enable policy for any page cache object statistics for bigpages global policy for page cache? currently it uses process policy. Requires first item above. handle mremap for shared memory (currently ignored for the policy) grows down? make bind policy root only? It can trigger oom much faster and the kernel is not always grateful with that. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mempolicy.h> #include <linux/pagewalk.h> #include <linux/highmem.h> #include <linux/hugetlb.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/nodemask.h> #include <linux/cpuset.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/interrupt.h> #include <linux/init.h> #include <linux/compat.h> #include <linux/ptrace.h> #include <linux/swap.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/migrate.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/ctype.h> #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/printk.h> #include <linux/swapops.h> #include <asm/tlbflush.h> #include <linux/uaccess.h> #include "internal.h" /* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ static struct kmem_cache *policy_cache; static struct kmem_cache *sn_cache; /* Highest zone. An specific allocation for a zone below that is not policied. */ enum zone_type policy_zone = 0; /* * run-time system-wide default policy => local allocation */ static struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ .mode = MPOL_PREFERRED, .flags = MPOL_F_LOCAL, }; static struct mempolicy preferred_node_policy[MAX_NUMNODES]; /** * numa_map_to_online_node - Find closest online node * @node: Node id to start the search * * Lookup the next closest node by distance if @nid is not online. */ int numa_map_to_online_node(int node) { int min_dist = INT_MAX, dist, n, min_node; if (node == NUMA_NO_NODE || node_online(node)) return node; min_node = node; for_each_online_node(n) { dist = node_distance(node, n); if (dist < min_dist) { min_dist = dist; min_node = n; } } return min_node; } EXPORT_SYMBOL_GPL(numa_map_to_online_node); struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; int node; if (pol) return pol; node = numa_node_id(); if (node != NUMA_NO_NODE) { pol = &preferred_node_policy[node]; /* preferred_node_policy is not initialised early in boot */ if (pol->mode) return pol; } return &default_policy; } static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); } mpol_ops[MPOL_MAX]; static inline int mpol_store_user_nodemask(const struct mempolicy *pol) { return pol->flags & MPOL_MODE_FLAGS; } static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, const nodemask_t *rel) { nodemask_t tmp; nodes_fold(tmp, *orig, nodes_weight(*rel)); nodes_onto(*ret, tmp, *rel); } static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; pol->v.nodes = *nodes; return 0; } static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) { if (!nodes) pol->flags |= MPOL_F_LOCAL; /* local allocation */ else if (nodes_empty(*nodes)) return -EINVAL; /* no allowed nodes */ else pol->v.preferred_node = first_node(*nodes); return 0; } static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; pol->v.nodes = *nodes; return 0; } /* * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if * any, for the new policy. mpol_new() has already validated the nodes * parameter with respect to the policy mode and flags. But, we need to * handle an empty nodemask with MPOL_PREFERRED here. * * Must be called holding task's alloc_lock to protect task's mems_allowed * and mempolicy. May also be called holding the mmap_lock for write. */ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes, struct nodemask_scratch *nsc) { int ret; /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ if (pol == NULL) return 0; /* Check N_MEMORY */ nodes_and(nsc->mask1, cpuset_current_mems_allowed, node_states[N_MEMORY]); VM_BUG_ON(!nodes); if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) nodes = NULL; /* explicit local allocation */ else { if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); else nodes_and(nsc->mask2, *nodes, nsc->mask1); if (mpol_store_user_nodemask(pol)) pol->w.user_nodemask = *nodes; else pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed; } if (nodes) ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); else ret = mpol_ops[pol->mode].create(pol, NULL); return ret; } /* * This function just creates a new policy, does some check and simple * initialization. You must invoke mpol_set_nodemask() to set nodes. */ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *policy; pr_debug("setting mode %d flags %d nodes[0] %lx\n", mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE); if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); return NULL; } VM_BUG_ON(!nodes); /* * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). * All other modes require a valid pointer to a non-empty nodemask. */ if (mode == MPOL_PREFERRED) { if (nodes_empty(*nodes)) { if (((flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); } } else if (mode == MPOL_LOCAL) { if (!nodes_empty(*nodes) || (flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES)) return ERR_PTR(-EINVAL); mode = MPOL_PREFERRED; } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); atomic_set(&policy->refcnt, 1); policy->mode = mode; policy->flags = flags; return policy; } /* Slow path of a mpol destructor. */ void __mpol_put(struct mempolicy *p) { if (!atomic_dec_and_test(&p->refcnt)) return; kmem_cache_free(policy_cache, p); } static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) { } static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { nodemask_t tmp; if (pol->flags & MPOL_F_STATIC_NODES) nodes_and(tmp, pol->w.user_nodemask, *nodes); else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed, *nodes); pol->w.cpuset_mems_allowed = *nodes; } if (nodes_empty(tmp)) tmp = *nodes; pol->v.nodes = tmp; } static void mpol_rebind_preferred(struct mempolicy *pol, const nodemask_t *nodes) { nodemask_t tmp; if (pol->flags & MPOL_F_STATIC_NODES) { int node = first_node(pol->w.user_nodemask); if (node_isset(node, *nodes)) { pol->v.preferred_node = node; pol->flags &= ~MPOL_F_LOCAL; } else pol->flags |= MPOL_F_LOCAL; } else if (pol->flags & MPOL_F_RELATIVE_NODES) { mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); pol->v.preferred_node = first_node(tmp); } else if (!(pol->flags & MPOL_F_LOCAL)) { pol->v.preferred_node = node_remap(pol->v.preferred_node, pol->w.cpuset_mems_allowed, *nodes); pol->w.cpuset_mems_allowed = *nodes; } } /* * mpol_rebind_policy - Migrate a policy to a different set of nodes * * Per-vma policies are protected by mmap_lock. Allocations using per-task * policies are protected by task->mems_allowed_seq to prevent a premature * OOM/allocation failure due to parallel nodemask modification. */ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol) return; if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; mpol_ops[pol->mode].rebind(pol, newmask); } /* * Wrapper for mpol_rebind_policy() that just requires task * pointer, and updates task mempolicy. * * Called with task's alloc_lock held. */ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { mpol_rebind_policy(tsk->mempolicy, new); } /* * Rebind each vma in mm to new nodemask. * * Call holding a reference to mm. Takes mm->mmap_lock during call. */ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { struct vm_area_struct *vma; mmap_write_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) mpol_rebind_policy(vma->vm_policy, new); mmap_write_unlock(mm); } static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { [MPOL_DEFAULT] = { .rebind = mpol_rebind_default, }, [MPOL_INTERLEAVE] = { .create = mpol_new_interleave, .rebind = mpol_rebind_nodemask, }, [MPOL_PREFERRED] = { .create = mpol_new_preferred, .rebind = mpol_rebind_preferred, }, [MPOL_BIND] = { .create = mpol_new_bind, .rebind = mpol_rebind_nodemask, }, }; static int migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags); struct queue_pages { struct list_head *pagelist; unsigned long flags; nodemask_t *nmask; unsigned long start; unsigned long end; struct vm_area_struct *first; }; /* * Check if the page's nid is in qp->nmask. * * If MPOL_MF_INVERT is set in qp->flags, check if the nid is * in the invert of qp->nmask. */ static inline bool queue_pages_required(struct page *page, struct queue_pages *qp) { int nid = page_to_nid(page); unsigned long flags = qp->flags; return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); } /* * queue_pages_pmd() has four possible return values: * 0 - pages are placed on the right node or queued successfully. * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were * specified. * 2 - THP was split. * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an * existing page was already on a node that does not follow the * policy. */ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, unsigned long end, struct mm_walk *walk) __releases(ptl) { int ret = 0; struct page *page; struct queue_pages *qp = walk->private; unsigned long flags; if (unlikely(is_pmd_migration_entry(*pmd))) { ret = -EIO; goto unlock; } page = pmd_page(*pmd); if (is_huge_zero_page(page)) { spin_unlock(ptl); __split_huge_pmd(walk->vma, pmd, addr, false, NULL); ret = 2; goto out; } if (!queue_pages_required(page, qp)) goto unlock; flags = qp->flags; /* go to thp migration */ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { if (!vma_migratable(walk->vma) || migrate_page_add(page, qp->pagelist, flags)) { ret = 1; goto unlock; } } else ret = -EIO; unlock: spin_unlock(ptl); out: return ret; } /* * Scan through pages checking if pages follow certain conditions, * and move them to the pagelist if they do. * * queue_pages_pte_range() has three possible return values: * 0 - pages are placed on the right node or queued successfully. * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were * specified. * -EIO - only MPOL_MF_STRICT was specified and an existing page was already * on a node that does not follow the policy. */ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; struct page *page; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; int ret; bool has_unmovable = false; pte_t *pte, *mapped_pte; spinlock_t *ptl; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { ret = queue_pages_pmd(pmd, ptl, addr, end, walk); if (ret != 2) return ret; } /* THP was split, fall through to pte walk */ if (pmd_trans_unstable(pmd)) return 0; mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) continue; page = vm_normal_page(vma, addr, *pte); if (!page) continue; /* * vm_normal_page() filters out zero pages, but there might * still be PageReserved pages to skip, perhaps in a VDSO. */ if (PageReserved(page)) continue; if (!queue_pages_required(page, qp)) continue; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { /* MPOL_MF_STRICT must be specified if we get here */ if (!vma_migratable(vma)) { has_unmovable = true; break; } /* * Do not abort immediately since there may be * temporary off LRU pages in the range. Still * need migrate other LRU pages. */ if (migrate_page_add(page, qp->pagelist, flags)) has_unmovable = true; } else break; } pte_unmap_unlock(mapped_pte, ptl); cond_resched(); if (has_unmovable) return 1; return addr != end ? -EIO : 0; } static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { int ret = 0; #ifdef CONFIG_HUGETLB_PAGE struct queue_pages *qp = walk->private; unsigned long flags = (qp->flags & MPOL_MF_VALID); struct page *page; spinlock_t *ptl; pte_t entry; ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); entry = huge_ptep_get(pte); if (!pte_present(entry)) goto unlock; page = pte_page(entry); if (!queue_pages_required(page, qp)) goto unlock; if (flags == MPOL_MF_STRICT) { /* * STRICT alone means only detecting misplaced page and no * need to further check other vma. */ ret = -EIO; goto unlock; } if (!vma_migratable(walk->vma)) { /* * Must be STRICT with MOVE*, otherwise .test_walk() have * stopped walking current vma. * Detecting misplaced page but allow migrating pages which * have been queued. */ ret = 1; goto unlock; } /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ if (flags & (MPOL_MF_MOVE_ALL) || (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) { if (!isolate_huge_page(page, qp->pagelist) && (flags & MPOL_MF_STRICT)) /* * Failed to isolate page but allow migrating pages * which have been queued. */ ret = 1; } unlock: spin_unlock(ptl); #else BUG(); #endif return ret; } #ifdef CONFIG_NUMA_BALANCING /* * This is used to mark a range of virtual addresses to be inaccessible. * These are later cleared by a NUMA hinting fault. Depending on these * faults, pages may be migrated for better NUMA placement. * * This is assuming that NUMA faults are handled using PROT_NONE. If * an architecture makes a different choice, it will need further * changes to the core. */ unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { int nr_updated; nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA); if (nr_updated) count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); return nr_updated; } #else static unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { return 0; } #endif /* CONFIG_NUMA_BALANCING */ static int queue_pages_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; struct queue_pages *qp = walk->private; unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; /* range check first */ VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma); if (!qp->first) { qp->first = vma; if (!(flags & MPOL_MF_DISCONTIG_OK) && (qp->start < vma->vm_start)) /* hole at head side of range */ return -EFAULT; } if (!(flags & MPOL_MF_DISCONTIG_OK) && ((vma->vm_end < qp->end) && (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start))) /* hole at middle or tail of range */ return -EFAULT; /* * Need check MPOL_MF_STRICT to return -EIO if possible * regardless of vma_migratable */ if (!vma_migratable(vma) && !(flags & MPOL_MF_STRICT)) return 1; if (endvma > end) endvma = end; if (flags & MPOL_MF_LAZY) { /* Similar to task_numa_work, skip inaccessible VMAs */ if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) && !(vma->vm_flags & VM_MIXEDMAP)) change_prot_numa(vma, start, endvma); return 1; } /* queue pages from current vma */ if (flags & MPOL_MF_VALID) return 0; return 1; } static const struct mm_walk_ops queue_pages_walk_ops = { .hugetlb_entry = queue_pages_hugetlb, .pmd_entry = queue_pages_pte_range, .test_walk = queue_pages_test_walk, }; /* * Walk through page tables and collect pages to be migrated. * * If pages found in a given range are on a set of nodes (determined by * @nodes and @flags,) it's isolated and queued to the pagelist which is * passed via @private. * * queue_pages_range() has three possible return values: * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were * specified. * 0 - queue pages successfully or no misplaced page. * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or * memory range specified by nodemask and maxnode points outside * your accessible address space (-EFAULT) */ static int queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, nodemask_t *nodes, unsigned long flags, struct list_head *pagelist) { int err; struct queue_pages qp = { .pagelist = pagelist, .flags = flags, .nmask = nodes, .start = start, .end = end, .first = NULL, }; err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp); if (!qp.first) /* whole range in hole */ err = -EFAULT; return err; } /* * Apply policy to a single VMA * This must be called with the mmap_lock held for writing. */ static int vma_replace_policy(struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct mempolicy *old; struct mempolicy *new; pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", vma->vm_start, vma->vm_end, vma->vm_pgoff, vma->vm_ops, vma->vm_file, vma->vm_ops ? vma->vm_ops->set_policy : NULL); new = mpol_dup(pol); if (IS_ERR(new)) return PTR_ERR(new); if (vma->vm_ops && vma->vm_ops->set_policy) { err = vma->vm_ops->set_policy(vma, new); if (err) goto err_out; } old = vma->vm_policy; vma->vm_policy = new; /* protected by mmap_lock */ mpol_put(old); return 0; err_out: mpol_put(new); return err; } /* Step 2: apply policy to a range and do splits. */ static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct mempolicy *new_pol) { struct vm_area_struct *next; struct vm_area_struct *prev; struct vm_area_struct *vma; int err = 0; pgoff_t pgoff; unsigned long vmstart; unsigned long vmend; vma = find_vma(mm, start); VM_BUG_ON(!vma); prev = vma->vm_prev; if (start > vma->vm_start) prev = vma; for (; vma && vma->vm_start < end; prev = vma, vma = next) { next = vma->vm_next; vmstart = max(start, vma->vm_start); vmend = min(end, vma->vm_end); if (mpol_equal(vma_policy(vma), new_pol)) continue; pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, new_pol, vma->vm_userfaultfd_ctx); if (prev) { vma = prev; next = vma->vm_next; if (mpol_equal(vma_policy(vma), new_pol)) continue; /* vma_merge() joined vma && vma->next, case 8 */ goto replace; } if (vma->vm_start != vmstart) { err = split_vma(vma->vm_mm, vma, vmstart, 1); if (err) goto out; } if (vma->vm_end != vmend) { err = split_vma(vma->vm_mm, vma, vmend, 0); if (err) goto out; } replace: err = vma_replace_policy(vma, new_pol); if (err) goto out; } out: return err; } /* Set the process memory policy */ static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *new, *old; NODEMASK_SCRATCH(scratch); int ret; if (!scratch) return -ENOMEM; new = mpol_new(mode, flags, nodes); if (IS_ERR(new)) { ret = PTR_ERR(new); goto out; } ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { mpol_put(new); goto out; } task_lock(current); old = current->mempolicy; current->mempolicy = new; if (new && new->mode == MPOL_INTERLEAVE) current->il_prev = MAX_NUMNODES-1; task_unlock(current); mpol_put(old); ret = 0; out: NODEMASK_SCRATCH_FREE(scratch); return ret; } /* * Return nodemask for policy for get_mempolicy() query * * Called with task's alloc_lock held */ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) { nodes_clear(*nodes); if (p == &default_policy) return; switch (p->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: *nodes = p->v.nodes; break; case MPOL_PREFERRED: if (!(p->flags & MPOL_F_LOCAL)) node_set(p->v.preferred_node, *nodes); /* else return empty node mask for local allocation */ break; default: BUG(); } } static int lookup_node(struct mm_struct *mm, unsigned long addr) { struct page *p = NULL; int err; int locked = 1; err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked); if (err > 0) { err = page_to_nid(p); put_page(p); } if (locked) mmap_read_unlock(mm); return err; } /* Retrieve NUMA policy */ static long do_get_mempolicy(int *policy, nodemask_t *nmask, unsigned long addr, unsigned long flags) { int err; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL; if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) return -EINVAL; if (flags & MPOL_F_MEMS_ALLOWED) { if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; *policy = 0; /* just so it's initialized */ task_lock(current); *nmask = cpuset_current_mems_allowed; task_unlock(current); return 0; } if (flags & MPOL_F_ADDR) { /* * Do NOT fall back to task policy if the * vma/shared policy at addr is NULL. We * want to return MPOL_DEFAULT in this case. */ mmap_read_lock(mm); vma = find_vma_intersection(mm, addr, addr+1); if (!vma) { mmap_read_unlock(mm); return -EFAULT; } if (vma->vm_ops && vma->vm_ops->get_policy) pol = vma->vm_ops->get_policy(vma, addr); else pol = vma->vm_policy; } else if (addr) return -EINVAL; if (!pol) pol = &default_policy; /* indicates default behavior */ if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { /* * Take a refcount on the mpol, lookup_node() * wil drop the mmap_lock, so after calling * lookup_node() only "pol" remains valid, "vma" * is stale. */ pol_refcount = pol; vma = NULL; mpol_get(pol); err = lookup_node(mm, addr); if (err < 0) goto out; *policy = err; } else if (pol == current->mempolicy && pol->mode == MPOL_INTERLEAVE) { *policy = next_node_in(current->il_prev, pol->v.nodes); } else { err = -EINVAL; goto out; } } else { *policy = pol == &default_policy ? MPOL_DEFAULT : pol->mode; /* * Internal mempolicy flags must be masked off before exposing * the policy to userspace. */ *policy |= (pol->flags & MPOL_MODE_FLAGS); } err = 0; if (nmask) { if (mpol_store_user_nodemask(pol)) { *nmask = pol->w.user_nodemask; } else { task_lock(current); get_policy_nodemask(pol, nmask); task_unlock(current); } } out: mpol_cond_put(pol); if (vma) mmap_read_unlock(mm); if (pol_refcount) mpol_put(pol_refcount); return err; } #ifdef CONFIG_MIGRATION /* * page migration, thp tail pages can be passed. */ static int migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags) { struct page *head = compound_head(page); /* * Avoid migrating a page that is shared with others. */ if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) { if (!isolate_lru_page(head)) { list_add_tail(&head->lru, pagelist); mod_node_page_state(page_pgdat(head), NR_ISOLATED_ANON + page_is_file_lru(head), thp_nr_pages(head)); } else if (flags & MPOL_MF_STRICT) { /* * Non-movable page may reach here. And, there may be * temporary off LRU pages or non-LRU movable pages. * Treat them as unmovable pages since they can't be * isolated, so they can't be moved at the moment. It * should return -EIO for this case too. */ return -EIO; } } return 0; } /* * Migrate pages from one node to a target node. * Returns error or the number of pages not migrated. */ static int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) { nodemask_t nmask; LIST_HEAD(pagelist); int err = 0; struct migration_target_control mtc = { .nid = dest, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, }; nodes_clear(nmask); node_set(source, nmask); /* * This does not "check" the range but isolates all pages that * need migration. Between passing in the full user address * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. */ VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(&pagelist); } return err; } /* * Move pages between the two nodesets so as to preserve the physical * layout as much as possible. * * Returns the number of page that could not be moved. */ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { int busy = 0; int err; nodemask_t tmp; err = migrate_prep(); if (err) return err; mmap_read_lock(mm); /* * Find a 'source' bit set in 'tmp' whose corresponding 'dest' * bit in 'to' is not also set in 'tmp'. Clear the found 'source' * bit in 'tmp', and return that <source, dest> pair for migration. * The pair of nodemasks 'to' and 'from' define the map. * * If no pair of bits is found that way, fallback to picking some * pair of 'source' and 'dest' bits that are not the same. If the * 'source' and 'dest' bits are the same, this represents a node * that will be migrating to itself, so no pages need move. * * If no bits are left in 'tmp', or if all remaining bits left * in 'tmp' correspond to the same bit in 'to', return false * (nothing left to migrate). * * This lets us pick a pair of nodes to migrate between, such that * if possible the dest node is not already occupied by some other * source node, minimizing the risk of overloading the memory on a * node that would happen if we migrated incoming memory to a node * before migrating outgoing memory source that same node. * * A single scan of tmp is sufficient. As we go, we remember the * most recent <s, d> pair that moved (s != d). If we find a pair * that not only moved, but what's better, moved to an empty slot * (d is not set in tmp), then we break out then, with that pair. * Otherwise when we finish scanning from_tmp, we at least have the * most recent <s, d> pair that moved. If we get all the way through * the scan of tmp without finding any node that moved, much less * moved to an empty node, then there is nothing left worth migrating. */ tmp = *from; while (!nodes_empty(tmp)) { int s,d; int source = NUMA_NO_NODE; int dest = 0; for_each_node_mask(s, tmp) { /* * do_migrate_pages() tries to maintain the relative * node relationship of the pages established between * threads and memory areas. * * However if the number of source nodes is not equal to * the number of destination nodes we can not preserve * this node relative relationship. In that case, skip * copying memory from a node that is in the destination * mask. * * Example: [2,3,4] -> [3,4,5] moves everything. * [0-7] - > [3,4,5] moves only 0,1,2,6,7. */ if ((nodes_weight(*from) != nodes_weight(*to)) && (node_isset(s, *to))) continue; d = node_remap(s, *from, *to); if (s == d) continue; source = s; /* Node moved. Memorize */ dest = d; /* dest not in remaining from nodes? */ if (!node_isset(dest, tmp)) break; } if (source == NUMA_NO_NODE) break; node_clear(source, tmp); err = migrate_to_node(mm, source, dest, flags); if (err > 0) busy += err; if (err < 0) break; } mmap_read_unlock(mm); if (err < 0) return err; return busy; } /* * Allocate a new page for page migration based on vma policy. * Start by assuming the page is mapped by the same vma as contains @start. * Search forward from there, if not. N.B., this assumes that the * list of pages handed to migrate_pages()--which is how we get here-- * is in virtual address order. */ static struct page *new_page(struct page *page, unsigned long start) { struct vm_area_struct *vma; unsigned long address; vma = find_vma(current->mm, start); while (vma) { address = page_address_in_vma(page, vma); if (address != -EFAULT) break; vma = vma->vm_next; } if (PageHuge(page)) { return alloc_huge_page_vma(page_hstate(compound_head(page)), vma, address); } else if (PageTransHuge(page)) { struct page *thp; thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address, HPAGE_PMD_ORDER); if (!thp) return NULL; prep_transhuge_page(thp); return thp; } /* * if !vma, alloc_page_vma() will use task or system default policy */ return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL, vma, address); } #else static int migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags) { return -EIO; } int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { return -ENOSYS; } static struct page *new_page(struct page *page, unsigned long start) { return NULL; } #endif static long do_mbind(unsigned long start, unsigned long len, unsigned short mode, unsigned short mode_flags, nodemask_t *nmask, unsigned long flags) { struct mm_struct *mm = current->mm; struct mempolicy *new; unsigned long end; int err; int ret; LIST_HEAD(pagelist); if (flags & ~(unsigned long)MPOL_MF_VALID) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; if (start & ~PAGE_MASK) return -EINVAL; if (mode == MPOL_DEFAULT) flags &= ~MPOL_MF_STRICT; len = (len + PAGE_SIZE - 1) & PAGE_MASK; end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; new = mpol_new(mode, mode_flags, nmask); if (IS_ERR(new)) return PTR_ERR(new); if (flags & MPOL_MF_LAZY) new->flags |= MPOL_F_MOF; /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all */ if (!new) flags |= MPOL_MF_DISCONTIG_OK; pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", start, start + len, mode, mode_flags, nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE); if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { err = migrate_prep(); if (err) goto mpol_out; } { NODEMASK_SCRATCH(scratch); if (scratch) { mmap_write_lock(mm); err = mpol_set_nodemask(new, nmask, scratch); if (err) mmap_write_unlock(mm); } else err = -ENOMEM; NODEMASK_SCRATCH_FREE(scratch); } if (err) goto mpol_out; ret = queue_pages_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); if (ret < 0) { err = ret; goto up_out; } err = mbind_range(mm, start, end, new); if (!err) { int nr_failed = 0; if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed = migrate_pages(&pagelist, new_page, NULL, start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); if (nr_failed) putback_movable_pages(&pagelist); } if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT))) err = -EIO; } else { up_out: if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); } mmap_write_unlock(mm); mpol_out: mpol_put(new); return err; } /* * User space interface with variable sized bitmaps for nodelists. */ /* Copy a node mask from user space. */ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { unsigned long k; unsigned long t; unsigned long nlongs; unsigned long endmask; --maxnode; nodes_clear(*nodes); if (maxnode == 0 || !nmask) return 0; if (maxnode > PAGE_SIZE*BITS_PER_BYTE) return -EINVAL; nlongs = BITS_TO_LONGS(maxnode); if ((maxnode % BITS_PER_LONG) == 0) endmask = ~0UL; else endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; /* * When the user specified more nodes than supported just check * if the non supported part is all zero. * * If maxnode have more longs than MAX_NUMNODES, check * the bits in that area first. And then go through to * check the rest bits which equal or bigger than MAX_NUMNODES. * Otherwise, just check bits [MAX_NUMNODES, maxnode). */ if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { if (get_user(t, nmask + k)) return -EFAULT; if (k == nlongs - 1) { if (t & endmask) return -EINVAL; } else if (t) return -EINVAL; } nlongs = BITS_TO_LONGS(MAX_NUMNODES); endmask = ~0UL; } if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) { unsigned long valid_mask = endmask; valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); if (get_user(t, nmask + nlongs - 1)) return -EFAULT; if (t & valid_mask) return -EINVAL; } if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) return -EFAULT; nodes_addr(*nodes)[nlongs-1] &= endmask; return 0; } /* Copy a kernel node mask to user space */ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, nodemask_t *nodes) { unsigned long copy = ALIGN(maxnode-1, 64) / 8; unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); if (copy > nbytes) { if (copy > PAGE_SIZE) return -EINVAL; if (clear_user((char __user *)mask + nbytes, copy - nbytes)) return -EFAULT; copy = nbytes; } return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; } static long kernel_mbind(unsigned long start, unsigned long len, unsigned long mode, const unsigned long __user *nmask, unsigned long maxnode, unsigned int flags) { nodemask_t nodes; int err; unsigned short mode_flags; start = untagged_addr(start); mode_flags = mode & MPOL_MODE_FLAGS; mode &= ~MPOL_MODE_FLAGS; if (mode >= MPOL_MAX) return -EINVAL; if ((mode_flags & MPOL_F_STATIC_NODES) && (mode_flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; return do_mbind(start, len, mode, mode_flags, &nodes, flags); } SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned int, flags) { return kernel_mbind(start, len, mode, nmask, maxnode, flags); } /* Set the process memory policy */ static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask, unsigned long maxnode) { int err; nodemask_t nodes; unsigned short flags; flags = mode & MPOL_MODE_FLAGS; mode &= ~MPOL_MODE_FLAGS; if ((unsigned int)mode >= MPOL_MAX) return -EINVAL; if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; return do_set_mempolicy(mode, flags, &nodes); } SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, unsigned long, maxnode) { return kernel_set_mempolicy(mode, nmask, maxnode); } static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *old_nodes, const unsigned long __user *new_nodes) { struct mm_struct *mm = NULL; struct task_struct *task; nodemask_t task_nodes; int err; nodemask_t *old; nodemask_t *new; NODEMASK_SCRATCH(scratch); if (!scratch) return -ENOMEM; old = &scratch->mask1; new = &scratch->mask2; err = get_nodes(old, old_nodes, maxnode); if (err) goto out; err = get_nodes(new, new_nodes, maxnode); if (err) goto out; /* Find the mm_struct */ rcu_read_lock(); task = pid ? find_task_by_vpid(pid) : current; if (!task) { rcu_read_unlock(); err = -ESRCH; goto out; } get_task_struct(task); err = -EINVAL; /* * Check if this process has the right to modify the specified process. * Use the regular "ptrace_may_access()" checks. */ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { rcu_read_unlock(); err = -EPERM; goto out_put; } rcu_read_unlock(); task_nodes = cpuset_mems_allowed(task); /* Is the user allowed to access the target nodes? */ if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out_put; } task_nodes = cpuset_mems_allowed(current); nodes_and(*new, *new, task_nodes); if (nodes_empty(*new)) goto out_put; err = security_task_movememory(task); if (err) goto out_put; mm = get_task_mm(task); put_task_struct(task); if (!mm) { err = -EINVAL; goto out; } err = do_migrate_pages(mm, old, new, capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); mmput(mm); out: NODEMASK_SCRATCH_FREE(scratch); return err; out_put: put_task_struct(task); goto out; } SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, const unsigned long __user *, old_nodes, const unsigned long __user *, new_nodes) { return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); } /* Retrieve NUMA policy */ static int kernel_get_mempolicy(int __user *policy, unsigned long __user *nmask, unsigned long maxnode, unsigned long addr, unsigned long flags) { int err; int pval; nodemask_t nodes; if (nmask != NULL && maxnode < nr_node_ids) return -EINVAL; addr = untagged_addr(addr); err = do_get_mempolicy(&pval, &nodes, addr, flags); if (err) return err; if (policy && put_user(pval, policy)) return -EFAULT; if (nmask) err = copy_nodes_to_user(nmask, maxnode, &nodes); return err; } SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, unsigned long __user *, nmask, unsigned long, maxnode, unsigned long, addr, unsigned long, flags) { return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, compat_ulong_t __user *, nmask, compat_ulong_t, maxnode, compat_ulong_t, addr, compat_ulong_t, flags) { long err; unsigned long __user *nm = NULL; unsigned long nr_bits, alloc_size; DECLARE_BITMAP(bm, MAX_NUMNODES); nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids); alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; if (nmask) nm = compat_alloc_user_space(alloc_size); err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags); if (!err && nmask) { unsigned long copy_size; copy_size = min_t(unsigned long, sizeof(bm), alloc_size); err = copy_from_user(bm, nm, copy_size); /* ensure entire bitmap is zeroed */ err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); err |= compat_put_bitmap(nmask, bm, nr_bits); } return err; } COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, compat_ulong_t, maxnode) { unsigned long __user *nm = NULL; unsigned long nr_bits, alloc_size; DECLARE_BITMAP(bm, MAX_NUMNODES); nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; if (nmask) { if (compat_get_bitmap(bm, nmask, nr_bits)) return -EFAULT; nm = compat_alloc_user_space(alloc_size); if (copy_to_user(nm, bm, alloc_size)) return -EFAULT; } return kernel_set_mempolicy(mode, nm, nr_bits+1); } COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, compat_ulong_t, mode, compat_ulong_t __user *, nmask, compat_ulong_t, maxnode, compat_ulong_t, flags) { unsigned long __user *nm = NULL; unsigned long nr_bits, alloc_size; nodemask_t bm; nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; if (nmask) { if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits)) return -EFAULT; nm = compat_alloc_user_space(alloc_size); if (copy_to_user(nm, nodes_addr(bm), alloc_size)) return -EFAULT; } return kernel_mbind(start, len, mode, nm, nr_bits+1, flags); } COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, compat_ulong_t, maxnode, const compat_ulong_t __user *, old_nodes, const compat_ulong_t __user *, new_nodes) { unsigned long __user *old = NULL; unsigned long __user *new = NULL; nodemask_t tmp_mask; unsigned long nr_bits; unsigned long size; nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); size = ALIGN(nr_bits, BITS_PER_LONG) / 8; if (old_nodes) { if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) return -EFAULT; old = compat_alloc_user_space(new_nodes ? size * 2 : size); if (new_nodes) new = old + size / sizeof(unsigned long); if (copy_to_user(old, nodes_addr(tmp_mask), size)) return -EFAULT; } if (new_nodes) { if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) return -EFAULT; if (new == NULL) new = compat_alloc_user_space(size); if (copy_to_user(new, nodes_addr(tmp_mask), size)) return -EFAULT; } return kernel_migrate_pages(pid, nr_bits + 1, old, new); } #endif /* CONFIG_COMPAT */ bool vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_IO | VM_PFNMAP)) return false; /* * DAX device mappings require predictable access latency, so avoid * incurring periodic faults. */ if (vma_is_dax(vma)) return false; if (is_vm_hugetlb_page(vma) && !hugepage_migration_supported(hstate_vma(vma))) return false; /* * Migration allocates pages in the highest zone. If we cannot * do so then migration (at least from node to node) is not * possible. */ if (vma->vm_file && gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping)) < policy_zone) return false; return true; } struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = NULL; if (vma) { if (vma->vm_ops && vma->vm_ops->get_policy) { pol = vma->vm_ops->get_policy(vma, addr); } else if (vma->vm_policy) { pol = vma->vm_policy; /* * shmem_alloc_page() passes MPOL_F_SHARED policy with * a pseudo vma whose vma->vm_ops=NULL. Take a reference * count on these policies which will be dropped by * mpol_cond_put() later */ if (mpol_needs_cond_ref(pol)) mpol_get(pol); } } return pol; } /* * get_vma_policy(@vma, @addr) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup * * Returns effective policy for a VMA at specified address. * Falls back to current->mempolicy or system default policy, as necessary. * Shared policies [those marked as MPOL_F_SHARED] require an extra reference * count--added by the get_policy() vm_op, as appropriate--to protect against * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. */ static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = __get_vma_policy(vma, addr); if (!pol) pol = get_task_policy(current); return pol; } bool vma_policy_mof(struct vm_area_struct *vma) { struct mempolicy *pol; if (vma->vm_ops && vma->vm_ops->get_policy) { bool ret = false; pol = vma->vm_ops->get_policy(vma, vma->vm_start); if (pol && (pol->flags & MPOL_F_MOF)) ret = true; mpol_cond_put(pol); return ret; } pol = vma->vm_policy; if (!pol) pol = get_task_policy(current); return pol->flags & MPOL_F_MOF; } static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) { enum zone_type dynamic_policy_zone = policy_zone; BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); /* * if policy->v.nodes has movable memory only, * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. * * policy->v.nodes is intersect with node_states[N_MEMORY]. * so if the following test faile, it implies * policy->v.nodes has movable memory only. */ if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY])) dynamic_policy_zone = ZONE_MOVABLE; return zone >= dynamic_policy_zone; } /* * Return a nodemask representing a mempolicy for filtering nodes for * page allocation */ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) { /* Lower zones don't get a nodemask applied for MPOL_BIND */ if (unlikely(policy->mode == MPOL_BIND) && apply_policy_zone(policy, gfp_zone(gfp)) && cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) return &policy->v.nodes; return NULL; } /* Return the node id preferred by the given mempolicy, or the given id */ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) { if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL)) nd = policy->v.preferred_node; else { /* * __GFP_THISNODE shouldn't even be used with the bind policy * because we might easily break the expectation to stay on the * requested node and not break the policy. */ WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); } return nd; } /* Do dynamic interleaving for a process */ static unsigned interleave_nodes(struct mempolicy *policy) { unsigned next; struct task_struct *me = current; next = next_node_in(me->il_prev, policy->v.nodes); if (next < MAX_NUMNODES) me->il_prev = next; return next; } /* * Depending on the memory policy provide a node from which to allocate the * next slab entry. */ unsigned int mempolicy_slab_node(void) { struct mempolicy *policy; int node = numa_mem_id(); if (in_interrupt()) return node; policy = current->mempolicy; if (!policy || policy->flags & MPOL_F_LOCAL) return node; switch (policy->mode) { case MPOL_PREFERRED: /* * handled MPOL_F_LOCAL above */ return policy->v.preferred_node; case MPOL_INTERLEAVE: return interleave_nodes(policy); case MPOL_BIND: { struct zoneref *z; /* * Follow bind policy behavior and start allocation at the * first node. */ struct zonelist *zonelist; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, highest_zoneidx, &policy->v.nodes); return z->zone ? zone_to_nid(z->zone) : node; } default: BUG(); } } /* * Do static interleaving for a VMA with known offset @n. Returns the n'th * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the * number of present nodes. */ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) { unsigned nnodes = nodes_weight(pol->v.nodes); unsigned target; int i; int nid; if (!nnodes) return numa_node_id(); target = (unsigned int)n % nnodes; nid = first_node(pol->v.nodes); for (i = 0; i < target; i++) nid = next_node(nid, pol->v.nodes); return nid; } /* Determine a node number for interleave */ static inline unsigned interleave_nid(struct mempolicy *pol, struct vm_area_struct *vma, unsigned long addr, int shift) { if (vma) { unsigned long off; /* * for small pages, there is no difference between * shift and PAGE_SHIFT, so the bit-shift is safe. * for huge pages, since vm_pgoff is in units of small * pages, we need to shift off the always 0 bits to get * a useful offset. */ BUG_ON(shift < PAGE_SHIFT); off = vma->vm_pgoff >> (shift - PAGE_SHIFT); off += (addr - vma->vm_start) >> shift; return offset_il_node(pol, off); } else return interleave_nodes(pol); } #ifdef CONFIG_HUGETLBFS /* * huge_node(@vma, @addr, @gfp_flags, @mpol) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup and interleave policy * @gfp_flags: for requested zone * @mpol: pointer to mempolicy pointer for reference counted mempolicy * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask * * Returns a nid suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'BIND, returns a pointer to the mempolicy's * @nodemask for filtering the zonelist. * * Must be protected by read_mems_allowed_begin() */ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { int nid; *mpol = get_vma_policy(vma, addr); *nodemask = NULL; /* assume !MPOL_BIND */ if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { nid = interleave_nid(*mpol, vma, addr, huge_page_shift(hstate_vma(vma))); } else { nid = policy_node(gfp_flags, *mpol, numa_node_id()); if ((*mpol)->mode == MPOL_BIND) *nodemask = &(*mpol)->v.nodes; } return nid; } /* * init_nodemask_of_mempolicy * * If the current task's mempolicy is "default" [NULL], return 'false' * to indicate default policy. Otherwise, extract the policy nodemask * for 'bind' or 'interleave' policy into the argument nodemask, or * initialize the argument nodemask to contain the single node for * 'preferred' or 'local' policy and return 'true' to indicate presence * of non-default mempolicy. * * We don't bother with reference counting the mempolicy [mpol_get/put] * because the current task is examining it's own mempolicy and a task's * mempolicy is only ever changed by the task itself. * * N.B., it is the caller's responsibility to free a returned nodemask. */ bool init_nodemask_of_mempolicy(nodemask_t *mask) { struct mempolicy *mempolicy; int nid; if (!(mask && current->mempolicy)) return false; task_lock(current); mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: if (mempolicy->flags & MPOL_F_LOCAL) nid = numa_node_id(); else nid = mempolicy->v.preferred_node; init_nodemask_of_node(mask, nid); break; case MPOL_BIND: case MPOL_INTERLEAVE: *mask = mempolicy->v.nodes; break; default: BUG(); } task_unlock(current); return true; } #endif /* * mempolicy_nodemask_intersects * * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default * policy. Otherwise, check for intersection between mask and the policy * nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local' * policy, always return true since it may allocate elsewhere on fallback. * * Takes task_lock(tsk) to prevent freeing of its mempolicy. */ bool mempolicy_nodemask_intersects(struct task_struct *tsk, const nodemask_t *mask) { struct mempolicy *mempolicy; bool ret = true; if (!mask) return ret; task_lock(tsk); mempolicy = tsk->mempolicy; if (!mempolicy) goto out; switch (mempolicy->mode) { case MPOL_PREFERRED: /* * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to * allocate from, they may fallback to other nodes when oom. * Thus, it's possible for tsk to have allocated memory from * nodes in mask. */ break; case MPOL_BIND: case MPOL_INTERLEAVE: ret = nodes_intersects(mempolicy->v.nodes, *mask); break; default: BUG(); } out: task_unlock(tsk); return ret; } /* Allocate a page in interleaved policy. Own path because it needs to do special accounting. */ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid) { struct page *page; page = __alloc_pages(gfp, order, nid); /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */ if (!static_branch_likely(&vm_numa_stat_key)) return page; if (page && page_to_nid(page) == nid) { preempt_disable(); __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT); preempt_enable(); } return page; } /** * alloc_pages_vma - Allocate a page for a VMA. * * @gfp: * %GFP_USER user allocation. * %GFP_KERNEL kernel allocations, * %GFP_HIGHMEM highmem/user allocations, * %GFP_FS allocation should not call back into a file system. * %GFP_ATOMIC don't sleep. * * @order:Order of the GFP allocation. * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual Address of the allocation. Must be inside the VMA. * @node: Which node to prefer for allocation (modulo policy). * @hugepage: for hugepages try only the preferred node if possible * * This function allocates a page from the kernel page pool and applies * a NUMA policy associated with the VMA or the current process. * When VMA is not NULL caller must read-lock the mmap_lock of the * mm_struct of the VMA to prevent it from going away. Should be used for * all allocations for pages that will be mapped into user space. Returns * NULL when no page can be allocated. */ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, int node, bool hugepage) { struct mempolicy *pol; struct page *page; int preferred_nid; nodemask_t *nmask; pol = get_vma_policy(vma, addr); if (pol->mode == MPOL_INTERLEAVE) { unsigned nid; nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); goto out; } if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { int hpage_node = node; /* * For hugepage allocation and non-interleave policy which * allows the current node (or other explicitly preferred * node) we only try to allocate from the current/preferred * node and don't fall back to other nodes, as the cost of * remote accesses would likely offset THP benefits. * * If the policy is interleave, or does not allow the current * node in its nodemask, we allocate the standard way. */ if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL)) hpage_node = pol->v.preferred_node; nmask = policy_nodemask(gfp, pol); if (!nmask || node_isset(hpage_node, *nmask)) { mpol_cond_put(pol); /* * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. */ page = __alloc_pages_node(hpage_node, gfp | __GFP_THISNODE | __GFP_NORETRY, order); /* * If hugepage allocations are configured to always * synchronous compact or the vma has been madvised * to prefer hugepage backing, retry allowing remote * memory with both reclaim and compact as well. */ if (!page && (gfp & __GFP_DIRECT_RECLAIM)) page = __alloc_pages_nodemask(gfp, order, hpage_node, nmask); goto out; } } nmask = policy_nodemask(gfp, pol); preferred_nid = policy_node(gfp, pol, node); page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); mpol_cond_put(pol); out: return page; } EXPORT_SYMBOL(alloc_pages_vma); /** * alloc_pages_current - Allocate pages. * * @gfp: * %GFP_USER user allocation, * %GFP_KERNEL kernel allocation, * %GFP_HIGHMEM highmem allocation, * %GFP_FS don't call back into a file system. * %GFP_ATOMIC don't sleep. * @order: Power of two of allocation size in pages. 0 is a single page. * * Allocate a page from the kernel page pool. When not in * interrupt context and apply the current process NUMA policy. * Returns NULL when no page can be allocated. */ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = &default_policy; struct page *page; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); /* * No reference counting needed for current->mempolicy * nor system default_policy */ if (pol->mode == MPOL_INTERLEAVE) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else page = __alloc_pages_nodemask(gfp, order, policy_node(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); return page; } EXPORT_SYMBOL(alloc_pages_current); int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { struct mempolicy *pol = mpol_dup(vma_policy(src)); if (IS_ERR(pol)) return PTR_ERR(pol); dst->vm_policy = pol; return 0; } /* * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it * rebinds the mempolicy its copying by calling mpol_rebind_policy() * with the mems_allowed returned by cpuset_mems_allowed(). This * keeps mempolicies cpuset relative after its cpuset moves. See * further kernel/cpuset.c update_nodemask(). * * current's mempolicy may be rebinded by the other task(the task that changes * cpuset's mems), so we needn't do rebind work for current task. */ /* Slow path of a mempolicy duplicate */ struct mempolicy *__mpol_dup(struct mempolicy *old) { struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); /* task's mempolicy is protected by alloc_lock */ if (old == current->mempolicy) { task_lock(current); *new = *old; task_unlock(current); } else *new = *old; if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); mpol_rebind_policy(new, &mems); } atomic_set(&new->refcnt, 1); return new; } /* Slow path of a mempolicy comparison */ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (!a || !b) return false; if (a->mode != b->mode) return false; if (a->flags != b->flags) return false; if (mpol_store_user_nodemask(a)) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) return false; switch (a->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: return !!nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: /* a's ->flags is the same as b's */ if (a->flags & MPOL_F_LOCAL) return true; return a->v.preferred_node == b->v.preferred_node; default: BUG(); return false; } } /* * Shared memory backing store policy support. * * Remember policies even when nobody has shared memory mapped. * The policies are kept in Red-Black tree linked from the inode. * They are protected by the sp->lock rwlock, which should be held * for any accesses to the tree. */ /* * lookup first element intersecting start-end. Caller holds sp->lock for * reading or for writing */ static struct sp_node * sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) { struct rb_node *n = sp->root.rb_node; while (n) { struct sp_node *p = rb_entry(n, struct sp_node, nd); if (start >= p->end) n = n->rb_right; else if (end <= p->start) n = n->rb_left; else break; } if (!n) return NULL; for (;;) { struct sp_node *w = NULL; struct rb_node *prev = rb_prev(n); if (!prev) break; w = rb_entry(prev, struct sp_node, nd); if (w->end <= start) break; n = prev; } return rb_entry(n, struct sp_node, nd); } /* * Insert a new shared policy into the list. Caller holds sp->lock for * writing. */ static void sp_insert(struct shared_policy *sp, struct sp_node *new) { struct rb_node **p = &sp->root.rb_node; struct rb_node *parent = NULL; struct sp_node *nd; while (*p) { parent = *p; nd = rb_entry(parent, struct sp_node, nd); if (new->start < nd->start) p = &(*p)->rb_left; else if (new->end > nd->end) p = &(*p)->rb_right; else BUG(); } rb_link_node(&new->nd, parent, p); rb_insert_color(&new->nd, &sp->root); pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, new->policy ? new->policy->mode : 0); } /* Find shared policy intersecting idx */ struct mempolicy * mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) { struct mempolicy *pol = NULL; struct sp_node *sn; if (!sp->root.rb_node) return NULL; read_lock(&sp->lock); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } read_unlock(&sp->lock); return pol; } static void sp_free(struct sp_node *n) { mpol_put(n->policy); kmem_cache_free(sn_cache, n); } /** * mpol_misplaced - check whether current page node is valid in policy * * @page: page to be checked * @vma: vm area where page mapped * @addr: virtual address where page mapped * * Lookup current policy node id for vma,addr and "compare to" page's * node id. * * Returns: * -1 - not misplaced, page is in the right node * node - node id where the page should be * * Policy determination "mimics" alloc_page_vma(). * Called from fault path where we know the vma and faulting address. */ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol; struct zoneref *z; int curnid = page_to_nid(page); unsigned long pgoff; int thiscpu = raw_smp_processor_id(); int thisnid = cpu_to_node(thiscpu); int polnid = NUMA_NO_NODE; int ret = -1; pol = get_vma_policy(vma, addr); if (!(pol->flags & MPOL_F_MOF)) goto out; switch (pol->mode) { case MPOL_INTERLEAVE: pgoff = vma->vm_pgoff; pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; polnid = offset_il_node(pol, pgoff); break; case MPOL_PREFERRED: if (pol->flags & MPOL_F_LOCAL) polnid = numa_node_id(); else polnid = pol->v.preferred_node; break; case MPOL_BIND: /* * allows binding to multiple nodes. * use current page if in policy nodemask, * else select nearest allowed node, if any. * If no allowed nodes, use current [!misplaced]. */ if (node_isset(curnid, pol->v.nodes)) goto out; z = first_zones_zonelist( node_zonelist(numa_node_id(), GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), &pol->v.nodes); polnid = zone_to_nid(z->zone); break; default: BUG(); } /* Migrate the page towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { polnid = thisnid; if (!should_numa_migrate_memory(current, page, curnid, thiscpu)) goto out; } if (curnid != polnid) ret = polnid; out: mpol_cond_put(pol); return ret; } /* * Drop the (possibly final) reference to task->mempolicy. It needs to be * dropped after task->mempolicy is set to NULL so that any allocation done as * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed * policy. */ void mpol_put_task_policy(struct task_struct *task) { struct mempolicy *pol; task_lock(task); pol = task->mempolicy; task->mempolicy = NULL; task_unlock(task); mpol_put(pol); } static void sp_delete(struct shared_policy *sp, struct sp_node *n) { pr_debug("deleting %lx-l%lx\n", n->start, n->end); rb_erase(&n->nd, &sp->root); sp_free(n); } static void sp_node_init(struct sp_node *node, unsigned long start, unsigned long end, struct mempolicy *pol) { node->start = start; node->end = end; node->policy = pol; } static struct sp_node *sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) { struct sp_node *n; struct mempolicy *newpol; n = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n) return NULL; newpol = mpol_dup(pol); if (IS_ERR(newpol)) { kmem_cache_free(sn_cache, n); return NULL; } newpol->flags |= MPOL_F_SHARED; sp_node_init(n, start, end, newpol); return n; } /* Replace a policy range. */ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, unsigned long end, struct sp_node *new) { struct sp_node *n; struct sp_node *n_new = NULL; struct mempolicy *mpol_new = NULL; int ret = 0; restart: write_lock(&sp->lock); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { struct rb_node *next = rb_next(&n->nd); if (n->start >= start) { if (n->end <= end) sp_delete(sp, n); else n->start = end; } else { /* Old policy spanning whole new range. */ if (n->end > end) { if (!n_new) goto alloc_new; *mpol_new = *n->policy; atomic_set(&mpol_new->refcnt, 1); sp_node_init(n_new, end, n->end, mpol_new); n->end = start; sp_insert(sp, n_new); n_new = NULL; mpol_new = NULL; break; } else n->end = start; } if (!next) break; n = rb_entry(next, struct sp_node, nd); } if (new) sp_insert(sp, new); write_unlock(&sp->lock); ret = 0; err_out: if (mpol_new) mpol_put(mpol_new); if (n_new) kmem_cache_free(sn_cache, n_new); return ret; alloc_new: write_unlock(&sp->lock); ret = -ENOMEM; n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n_new) goto err_out; mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!mpol_new) goto err_out; goto restart; } /** * mpol_shared_policy_init - initialize shared policy for inode * @sp: pointer to inode shared policy * @mpol: struct mempolicy to install * * Install non-NULL @mpol in inode's shared policy rb-tree. * On entry, the current task has a reference on a non-NULL @mpol. * This must be released on exit. * This is called at get_inode() calls and we can use GFP_KERNEL. */ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ rwlock_init(&sp->lock); if (mpol) { struct vm_area_struct pvma; struct mempolicy *new; NODEMASK_SCRATCH(scratch); if (!scratch) goto put_mpol; /* contextualize the tmpfs mount point mempolicy */ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); if (IS_ERR(new)) goto free_scratch; /* no valid nodemask intersection */ task_lock(current); ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); task_unlock(current); if (ret) goto put_new; /* Create pseudo-vma that contains just the policy */ vma_init(&pvma, NULL); pvma.vm_end = TASK_SIZE; /* policy covers entire file */ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ put_new: mpol_put(new); /* drop initial ref */ free_scratch: NODEMASK_SCRATCH_FREE(scratch); put_mpol: mpol_put(mpol); /* drop our incoming ref on sb mpol */ } } int mpol_set_shared_policy(struct shared_policy *info, struct vm_area_struct *vma, struct mempolicy *npol) { int err; struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n", vma->vm_pgoff, sz, npol ? npol->mode : -1, npol ? npol->flags : -1, npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE); if (npol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); if (!new) return -ENOMEM; } err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); if (err && new) sp_free(new); return err; } /* Free a backing policy store on inode delete. */ void mpol_free_shared_policy(struct shared_policy *p) { struct sp_node *n; struct rb_node *next; if (!p->root.rb_node) return; write_lock(&p->lock); next = rb_first(&p->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); sp_delete(p, n); } write_unlock(&p->lock); } #ifdef CONFIG_NUMA_BALANCING static int __initdata numabalancing_override; static void __init check_numabalancing_enable(void) { bool numabalancing_default = false; if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) numabalancing_default = true; /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ if (numabalancing_override) set_numabalancing_state(numabalancing_override == 1); if (num_online_nodes() > 1 && !numabalancing_override) { pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n", numabalancing_default ? "Enabling" : "Disabling"); set_numabalancing_state(numabalancing_default); } } static int __init setup_numabalancing(char *str) { int ret = 0; if (!str) goto out; if (!strcmp(str, "enable")) { numabalancing_override = 1; ret = 1; } else if (!strcmp(str, "disable")) { numabalancing_override = -1; ret = 1; } out: if (!ret) pr_warn("Unable to parse numa_balancing=\n"); return ret; } __setup("numa_balancing=", setup_numabalancing); #else static inline void __init check_numabalancing_enable(void) { } #endif /* CONFIG_NUMA_BALANCING */ /* assumes fs == KERNEL_DS */ void __init numa_policy_init(void) { nodemask_t interleave_nodes; unsigned long largest = 0; int nid, prefer = 0; policy_cache = kmem_cache_create("numa_policy", sizeof(struct mempolicy), 0, SLAB_PANIC, NULL); sn_cache = kmem_cache_create("shared_policy_node", sizeof(struct sp_node), 0, SLAB_PANIC, NULL); for_each_node(nid) { preferred_node_policy[nid] = (struct mempolicy) { .refcnt = ATOMIC_INIT(1), .mode = MPOL_PREFERRED, .flags = MPOL_F_MOF | MPOL_F_MORON, .v = { .preferred_node = nid, }, }; } /* * Set interleaving policy for system init. Interleaving is only * enabled across suitably sized nodes (default is >= 16MB), or * fall back to the largest node if they're all smaller. */ nodes_clear(interleave_nodes); for_each_node_state(nid, N_MEMORY) { unsigned long total_pages = node_present_pages(nid); /* Preserve the largest node */ if (largest < total_pages) { largest = total_pages; prefer = nid; } /* Interleave this node? */ if ((total_pages << PAGE_SHIFT) >= (16 << 20)) node_set(nid, interleave_nodes); } /* All too small, use the largest */ if (unlikely(nodes_empty(interleave_nodes))) node_set(prefer, interleave_nodes); if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) pr_err("%s: interleaving failed\n", __func__); check_numabalancing_enable(); } /* Reset policy of current process to default */ void numa_default_policy(void) { do_set_mempolicy(MPOL_DEFAULT, 0, NULL); } /* * Parse and format mempolicy from/to strings */ /* * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag. */ static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", [MPOL_LOCAL] = "local", }; #ifdef CONFIG_TMPFS /** * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. * @str: string containing mempolicy to parse * @mpol: pointer to struct mempolicy pointer, returned on success. * * Format of input: * <mode>[=<flags>][:<nodelist>] * * On success, returns 0, else 1 */ int mpol_parse_str(char *str, struct mempolicy **mpol) { struct mempolicy *new = NULL; unsigned short mode_flags; nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); int err = 1, mode; if (flags) *flags++ = '\0'; /* terminate mode string */ if (nodelist) { /* NUL-terminate mode or flags string */ *nodelist++ = '\0'; if (nodelist_parse(nodelist, nodes)) goto out; if (!nodes_subset(nodes, node_states[N_MEMORY])) goto out; } else nodes_clear(nodes); mode = match_string(policy_modes, MPOL_MAX, str); if (mode < 0) goto out; switch (mode) { case MPOL_PREFERRED: /* * Insist on a nodelist of one node only, although later * we use first_node(nodes) to grab a single node, so here * nodelist (or nodes) cannot be empty. */ if (nodelist) { char *rest = nodelist; while (isdigit(*rest)) rest++; if (*rest) goto out; if (nodes_empty(nodes)) goto out; } break; case MPOL_INTERLEAVE: /* * Default to online nodes with memory if no nodelist */ if (!nodelist) nodes = node_states[N_MEMORY]; break; case MPOL_LOCAL: /* * Don't allow a nodelist; mpol_new() checks flags */ if (nodelist) goto out; mode = MPOL_PREFERRED; break; case MPOL_DEFAULT: /* * Insist on a empty nodelist */ if (!nodelist) err = 0; goto out; case MPOL_BIND: /* * Insist on a nodelist */ if (!nodelist) goto out; } mode_flags = 0; if (flags) { /* * Currently, we only support two mutually exclusive * mode flags. */ if (!strcmp(flags, "static")) mode_flags |= MPOL_F_STATIC_NODES; else if (!strcmp(flags, "relative")) mode_flags |= MPOL_F_RELATIVE_NODES; else goto out; } new = mpol_new(mode, mode_flags, &nodes); if (IS_ERR(new)) goto out; /* * Save nodes for mpol_to_str() to show the tmpfs mount options * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. */ if (mode != MPOL_PREFERRED) new->v.nodes = nodes; else if (nodelist) new->v.preferred_node = first_node(nodes); else new->flags |= MPOL_F_LOCAL; /* * Save nodes for contextualization: this will be used to "clone" * the mempolicy in a specific context [cpuset] at a later time. */ new->w.user_nodemask = nodes; err = 0; out: /* Restore string for error message */ if (nodelist) *--nodelist = ':'; if (flags) *--flags = '='; if (!err) *mpol = new; return err; } #endif /* CONFIG_TMPFS */ /** * mpol_to_str - format a mempolicy structure for printing * @buffer: to contain formatted mempolicy string * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted * * Convert @pol into a string. If @buffer is too short, truncate the string. * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the * longest flag, "relative", and to display at least a few node ids. */ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; nodemask_t nodes = NODE_MASK_NONE; unsigned short mode = MPOL_DEFAULT; unsigned short flags = 0; if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { mode = pol->mode; flags = pol->flags; } switch (mode) { case MPOL_DEFAULT: break; case MPOL_PREFERRED: if (flags & MPOL_F_LOCAL) mode = MPOL_LOCAL; else node_set(pol->v.preferred_node, nodes); break; case MPOL_BIND: case MPOL_INTERLEAVE: nodes = pol->v.nodes; break; default: WARN_ON_ONCE(1); snprintf(p, maxlen, "unknown"); return; } p += snprintf(p, maxlen, "%s", policy_modes[mode]); if (flags & MPOL_MODE_FLAGS) { p += snprintf(p, buffer + maxlen - p, "="); /* * Currently, the only defined flags are mutually exclusive */ if (flags & MPOL_F_STATIC_NODES) p += snprintf(p, buffer + maxlen - p, "static"); else if (flags & MPOL_F_RELATIVE_NODES) p += snprintf(p, buffer + maxlen - p, "relative"); } if (!nodes_empty(nodes)) p += scnprintf(p, buffer + maxlen - p, ":%*pbl", nodemask_pr_args(&nodes)); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ /* * include/linux/eventpoll.h ( Efficient event polling implementation ) * Copyright (C) 2001,...,2006 Davide Libenzi * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * Davide Libenzi <davidel@xmailserver.org> * */ #ifndef _UAPI_LINUX_EVENTPOLL_H #define _UAPI_LINUX_EVENTPOLL_H /* For O_CLOEXEC */ #include <linux/fcntl.h> #include <linux/types.h> /* Flags for epoll_create1. */ #define EPOLL_CLOEXEC O_CLOEXEC /* Valid opcodes to issue to sys_epoll_ctl() */ #define EPOLL_CTL_ADD 1 #define EPOLL_CTL_DEL 2 #define EPOLL_CTL_MOD 3 /* Epoll event masks */ #define EPOLLIN (__force __poll_t)0x00000001 #define EPOLLPRI (__force __poll_t)0x00000002 #define EPOLLOUT (__force __poll_t)0x00000004 #define EPOLLERR (__force __poll_t)0x00000008 #define EPOLLHUP (__force __poll_t)0x00000010 #define EPOLLNVAL (__force __poll_t)0x00000020 #define EPOLLRDNORM (__force __poll_t)0x00000040 #define EPOLLRDBAND (__force __poll_t)0x00000080 #define EPOLLWRNORM (__force __poll_t)0x00000100 #define EPOLLWRBAND (__force __poll_t)0x00000200 #define EPOLLMSG (__force __poll_t)0x00000400 #define EPOLLRDHUP (__force __poll_t)0x00002000 /* Set exclusive wakeup mode for the target file descriptor */ #define EPOLLEXCLUSIVE ((__force __poll_t)(1U << 28)) /* * Request the handling of system wakeup events so as to prevent system suspends * from happening while those events are being processed. * * Assuming neither EPOLLET nor EPOLLONESHOT is set, system suspends will not be * re-allowed until epoll_wait is called again after consuming the wakeup * event(s). * * Requires CAP_BLOCK_SUSPEND */ #define EPOLLWAKEUP ((__force __poll_t)(1U << 29)) /* Set the One Shot behaviour for the target file descriptor */ #define EPOLLONESHOT ((__force __poll_t)(1U << 30)) /* Set the Edge Triggered behaviour for the target file descriptor */ #define EPOLLET ((__force __poll_t)(1U << 31)) /* * On x86-64 make the 64bit structure have the same alignment as the * 32bit structure. This makes 32bit emulation easier. * * UML/x86_64 needs the same packing as x86_64 */ #ifdef __x86_64__ #define EPOLL_PACKED __attribute__((packed)) #else #define EPOLL_PACKED #endif struct epoll_event { __poll_t events; __u64 data; } EPOLL_PACKED; #ifdef CONFIG_PM_SLEEP static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) { if ((epev->events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND)) epev->events &= ~EPOLLWAKEUP; } #else static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) { epev->events &= ~EPOLLWAKEUP; } #endif #endif /* _UAPI_LINUX_EVENTPOLL_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CTYPE_H #define _LINUX_CTYPE_H /* * NOTE! This ctype does not handle EOF like the standard C * library is required to. */ #define _U 0x01 /* upper */ #define _L 0x02 /* lower */ #define _D 0x04 /* digit */ #define _C 0x08 /* cntrl */ #define _P 0x10 /* punct */ #define _S 0x20 /* white space (space/lf/tab) */ #define _X 0x40 /* hex digit */ #define _SP 0x80 /* hard space (0x20) */ extern const unsigned char _ctype[]; #define __ismask(x) (_ctype[(int)(unsigned char)(x)]) #define isalnum(c) ((__ismask(c)&(_U|_L|_D)) != 0) #define isalpha(c) ((__ismask(c)&(_U|_L)) != 0) #define iscntrl(c) ((__ismask(c)&(_C)) != 0) static inline int isdigit(int c) { return '0' <= c && c <= '9'; } #define isgraph(c) ((__ismask(c)&(_P|_U|_L|_D)) != 0) #define islower(c) ((__ismask(c)&(_L)) != 0) #define isprint(c) ((__ismask(c)&(_P|_U|_L|_D|_SP)) != 0) #define ispunct(c) ((__ismask(c)&(_P)) != 0) /* Note: isspace() must return false for %NUL-terminator */ #define isspace(c) ((__ismask(c)&(_S)) != 0) #define isupper(c) ((__ismask(c)&(_U)) != 0) #define isxdigit(c) ((__ismask(c)&(_D|_X)) != 0) #define isascii(c) (((unsigned char)(c))<=0x7f) #define toascii(c) (((unsigned char)(c))&0x7f) static inline unsigned char __tolower(unsigned char c) { if (isupper(c)) c -= 'A'-'a'; return c; } static inline unsigned char __toupper(unsigned char c) { if (islower(c)) c -= 'a'-'A'; return c; } #define tolower(c) __tolower(c) #define toupper(c) __toupper(c) /* * Fast implementation of tolower() for internal usage. Do not use in your * code. */ static inline char _tolower(const char c) { return c | 0x20; } /* Fast check for octal digit */ static inline int isodigit(const char c) { return c >= '0' && c <= '7'; } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _ASM_X86_INAT_H #define _ASM_X86_INAT_H /* * x86 instruction attributes * * Written by Masami Hiramatsu <mhiramat@redhat.com> */ #include <asm/inat_types.h> /* * Internal bits. Don't use bitmasks directly, because these bits are * unstable. You should use checking functions. */ #define INAT_OPCODE_TABLE_SIZE 256 #define INAT_GROUP_TABLE_SIZE 8 /* Legacy last prefixes */ #define INAT_PFX_OPNDSZ 1 /* 0x66 */ /* LPFX1 */ #define INAT_PFX_REPE 2 /* 0xF3 */ /* LPFX2 */ #define INAT_PFX_REPNE 3 /* 0xF2 */ /* LPFX3 */ /* Other Legacy prefixes */ #define INAT_PFX_LOCK 4 /* 0xF0 */ #define INAT_PFX_CS 5 /* 0x2E */ #define INAT_PFX_DS 6 /* 0x3E */ #define INAT_PFX_ES 7 /* 0x26 */ #define INAT_PFX_FS 8 /* 0x64 */ #define INAT_PFX_GS 9 /* 0x65 */ #define INAT_PFX_SS 10 /* 0x36 */ #define INAT_PFX_ADDRSZ 11 /* 0x67 */ /* x86-64 REX prefix */ #define INAT_PFX_REX 12 /* 0x4X */ /* AVX VEX prefixes */ #define INAT_PFX_VEX2 13 /* 2-bytes VEX prefix */ #define INAT_PFX_VEX3 14 /* 3-bytes VEX prefix */ #define INAT_PFX_EVEX 15 /* EVEX prefix */ #define INAT_LSTPFX_MAX 3 #define INAT_LGCPFX_MAX 11 /* Immediate size */ #define INAT_IMM_BYTE 1 #define INAT_IMM_WORD 2 #define INAT_IMM_DWORD 3 #define INAT_IMM_QWORD 4 #define INAT_IMM_PTR 5 #define INAT_IMM_VWORD32 6 #define INAT_IMM_VWORD 7 /* Legacy prefix */ #define INAT_PFX_OFFS 0 #define INAT_PFX_BITS 4 #define INAT_PFX_MAX ((1 << INAT_PFX_BITS) - 1) #define INAT_PFX_MASK (INAT_PFX_MAX << INAT_PFX_OFFS) /* Escape opcodes */ #define INAT_ESC_OFFS (INAT_PFX_OFFS + INAT_PFX_BITS) #define INAT_ESC_BITS 2 #define INAT_ESC_MAX ((1 << INAT_ESC_BITS) - 1) #define INAT_ESC_MASK (INAT_ESC_MAX << INAT_ESC_OFFS) /* Group opcodes (1-16) */ #define INAT_GRP_OFFS (INAT_ESC_OFFS + INAT_ESC_BITS) #define INAT_GRP_BITS 5 #define INAT_GRP_MAX ((1 << INAT_GRP_BITS) - 1) #define INAT_GRP_MASK (INAT_GRP_MAX << INAT_GRP_OFFS) /* Immediates */ #define INAT_IMM_OFFS (INAT_GRP_OFFS + INAT_GRP_BITS) #define INAT_IMM_BITS 3 #define INAT_IMM_MASK (((1 << INAT_IMM_BITS) - 1) << INAT_IMM_OFFS) /* Flags */ #define INAT_FLAG_OFFS (INAT_IMM_OFFS + INAT_IMM_BITS) #define INAT_MODRM (1 << (INAT_FLAG_OFFS)) #define INAT_FORCE64 (1 << (INAT_FLAG_OFFS + 1)) #define INAT_SCNDIMM (1 << (INAT_FLAG_OFFS + 2)) #define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3)) #define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4)) #define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5)) #define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6)) #define INAT_EVEXONLY (1 << (INAT_FLAG_OFFS + 7)) /* Attribute making macros for attribute tables */ #define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS) #define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS) #define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM) #define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS) /* Identifiers for segment registers */ #define INAT_SEG_REG_IGNORE 0 #define INAT_SEG_REG_DEFAULT 1 #define INAT_SEG_REG_CS 2 #define INAT_SEG_REG_SS 3 #define INAT_SEG_REG_DS 4 #define INAT_SEG_REG_ES 5 #define INAT_SEG_REG_FS 6 #define INAT_SEG_REG_GS 7 /* Attribute search APIs */ extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode); extern int inat_get_last_prefix_id(insn_byte_t last_pfx); extern insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, int lpfx_id, insn_attr_t esc_attr); extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm, int lpfx_id, insn_attr_t esc_attr); extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, insn_byte_t vex_pp); /* Attribute checking functions */ static inline int inat_is_legacy_prefix(insn_attr_t attr) { attr &= INAT_PFX_MASK; return attr && attr <= INAT_LGCPFX_MAX; } static inline int inat_is_address_size_prefix(insn_attr_t attr) { return (attr & INAT_PFX_MASK) == INAT_PFX_ADDRSZ; } static inline int inat_is_operand_size_prefix(insn_attr_t attr) { return (attr & INAT_PFX_MASK) == INAT_PFX_OPNDSZ; } static inline int inat_is_rex_prefix(insn_attr_t attr) { return (attr & INAT_PFX_MASK) == INAT_PFX_REX; } static inline int inat_last_prefix_id(insn_attr_t attr) { if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX) return 0; else return attr & INAT_PFX_MASK; } static inline int inat_is_vex_prefix(insn_attr_t attr) { attr &= INAT_PFX_MASK; return attr == INAT_PFX_VEX2 || attr == INAT_PFX_VEX3 || attr == INAT_PFX_EVEX; } static inline int inat_is_evex_prefix(insn_attr_t attr) { return (attr & INAT_PFX_MASK) == INAT_PFX_EVEX; } static inline int inat_is_vex3_prefix(insn_attr_t attr) { return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3; } static inline int inat_is_escape(insn_attr_t attr) { return attr & INAT_ESC_MASK; } static inline int inat_escape_id(insn_attr_t attr) { return (attr & INAT_ESC_MASK) >> INAT_ESC_OFFS; } static inline int inat_is_group(insn_attr_t attr) { return attr & INAT_GRP_MASK; } static inline int inat_group_id(insn_attr_t attr) { return (attr & INAT_GRP_MASK) >> INAT_GRP_OFFS; } static inline int inat_group_common_attribute(insn_attr_t attr) { return attr & ~INAT_GRP_MASK; } static inline int inat_has_immediate(insn_attr_t attr) { return attr & INAT_IMM_MASK; } static inline int inat_immediate_size(insn_attr_t attr) { return (attr & INAT_IMM_MASK) >> INAT_IMM_OFFS; } static inline int inat_has_modrm(insn_attr_t attr) { return attr & INAT_MODRM; } static inline int inat_is_force64(insn_attr_t attr) { return attr & INAT_FORCE64; } static inline int inat_has_second_immediate(insn_attr_t attr) { return attr & INAT_SCNDIMM; } static inline int inat_has_moffset(insn_attr_t attr) { return attr & INAT_MOFFSET; } static inline int inat_has_variant(insn_attr_t attr) { return attr & INAT_VARIANT; } static inline int inat_accept_vex(insn_attr_t attr) { return attr & INAT_VEXOK; } static inline int inat_must_vex(insn_attr_t attr) { return attr & (INAT_VEXONLY | INAT_EVEXONLY); } static inline int inat_must_evex(insn_attr_t attr) { return attr & INAT_EVEXONLY; } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MEMREMAP_H_ #define _LINUX_MEMREMAP_H_ #include <linux/range.h> #include <linux/ioport.h> #include <linux/percpu-refcount.h> struct resource; struct device; /** * struct vmem_altmap - pre-allocated storage for vmemmap_populate * @base_pfn: base of the entire dev_pagemap mapping * @reserve: pages mapped, but reserved for driver use (relative to @base) * @free: free pages set aside in the mapping for memmap storage * @align: pages reserved to meet allocation alignments * @alloc: track pages consumed, private to vmemmap_populate() */ struct vmem_altmap { const unsigned long base_pfn; const unsigned long end_pfn; const unsigned long reserve; unsigned long free; unsigned long align; unsigned long alloc; }; /* * Specialize ZONE_DEVICE memory into multiple types each having differents * usage. * * MEMORY_DEVICE_PRIVATE: * Device memory that is not directly addressable by the CPU: CPU can neither * read nor write private memory. In this case, we do still have struct pages * backing the device memory. Doing so simplifies the implementation, but it is * important to remember that there are certain points at which the struct page * must be treated as an opaque object, rather than a "normal" struct page. * * A more complete discussion of unaddressable memory may be found in * include/linux/hmm.h and Documentation/vm/hmm.rst. * * MEMORY_DEVICE_FS_DAX: * Host memory that has similar access semantics as System RAM i.e. DMA * coherent and supports page pinning. In support of coordinating page * pinning vs other operations MEMORY_DEVICE_FS_DAX arranges for a * wakeup event whenever a page is unpinned and becomes idle. This * wakeup is used to coordinate physical address space management (ex: * fs truncate/hole punch) vs pinned pages (ex: device dma). * * MEMORY_DEVICE_GENERIC: * Host memory that has similar access semantics as System RAM i.e. DMA * coherent and supports page pinning. This is for example used by DAX devices * that expose memory using a character device. * * MEMORY_DEVICE_PCI_P2PDMA: * Device memory residing in a PCI BAR intended for use with Peer-to-Peer * transactions. */ enum memory_type { /* 0 is reserved to catch uninitialized type fields */ MEMORY_DEVICE_PRIVATE = 1, MEMORY_DEVICE_FS_DAX, MEMORY_DEVICE_GENERIC, MEMORY_DEVICE_PCI_P2PDMA, }; struct dev_pagemap_ops { /* * Called once the page refcount reaches 1. (ZONE_DEVICE pages never * reach 0 refcount unless there is a refcount bug. This allows the * device driver to implement its own memory management.) */ void (*page_free)(struct page *page); /* * Transition the refcount in struct dev_pagemap to the dead state. */ void (*kill)(struct dev_pagemap *pgmap); /* * Wait for refcount in struct dev_pagemap to be idle and reap it. */ void (*cleanup)(struct dev_pagemap *pgmap); /* * Used for private (un-addressable) device memory only. Must migrate * the page back to a CPU accessible page. */ vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf); }; #define PGMAP_ALTMAP_VALID (1 << 0) /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings * @altmap: pre-allocated/reserved memory for vmemmap allocations * @ref: reference count that pins the devm_memremap_pages() mapping * @internal_ref: internal reference if @ref is not provided by the caller * @done: completion for @internal_ref * @type: memory type: see MEMORY_* in memory_hotplug.h * @flags: PGMAP_* flags to specify defailed behavior * @ops: method table * @owner: an opaque pointer identifying the entity that manages this * instance. Used by various helpers to make sure that no * foreign ZONE_DEVICE memory is accessed. * @nr_range: number of ranges to be mapped * @range: range to be mapped when nr_range == 1 * @ranges: array of ranges to be mapped when nr_range > 1 */ struct dev_pagemap { struct vmem_altmap altmap; struct percpu_ref *ref; struct percpu_ref internal_ref; struct completion done; enum memory_type type; unsigned int flags; const struct dev_pagemap_ops *ops; void *owner; int nr_range; union { struct range range; struct range ranges[0]; }; }; static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) { if (pgmap->flags & PGMAP_ALTMAP_VALID) return &pgmap->altmap; return NULL; } #ifdef CONFIG_ZONE_DEVICE void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap); struct dev_pagemap *get_dev_pagemap(unsigned long pfn, struct dev_pagemap *pgmap); bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn); unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); unsigned long memremap_compat_align(void); #else static inline void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { /* * Fail attempts to call devm_memremap_pages() without * ZONE_DEVICE support enabled, this requires callers to fall * back to plain devm_memremap() based on config */ WARN_ON_ONCE(1); return ERR_PTR(-ENXIO); } static inline void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap) { } static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn, struct dev_pagemap *pgmap) { return NULL; } static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn) { return false; } static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) { return 0; } static inline void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) { } /* when memremap_pages() is disabled all archs can remap a single page */ static inline unsigned long memremap_compat_align(void) { return PAGE_SIZE; } #endif /* CONFIG_ZONE_DEVICE */ static inline void put_dev_pagemap(struct dev_pagemap *pgmap) { if (pgmap) percpu_ref_put(pgmap->ref); } #endif /* _LINUX_MEMREMAP_H_ */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM tlb #if !defined(_TRACE_TLB_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_TLB_H #include <linux/mm_types.h> #include <linux/tracepoint.h> #define TLB_FLUSH_REASON \ EM( TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" ) \ EM( TLB_REMOTE_SHOOTDOWN, "remote shootdown" ) \ EM( TLB_LOCAL_SHOOTDOWN, "local shootdown" ) \ EM( TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" ) \ EMe( TLB_REMOTE_SEND_IPI, "remote ipi send" ) /* * First define the enums in TLB_FLUSH_REASON to be exported to userspace * via TRACE_DEFINE_ENUM(). */ #undef EM #undef EMe #define EM(a,b) TRACE_DEFINE_ENUM(a); #define EMe(a,b) TRACE_DEFINE_ENUM(a); TLB_FLUSH_REASON /* * Now redefine the EM() and EMe() macros to map the enums to the strings * that will be printed in the output. */ #undef EM #undef EMe #define EM(a,b) { a, b }, #define EMe(a,b) { a, b } TRACE_EVENT(tlb_flush, TP_PROTO(int reason, unsigned long pages), TP_ARGS(reason, pages), TP_STRUCT__entry( __field( int, reason) __field(unsigned long, pages) ), TP_fast_assign( __entry->reason = reason; __entry->pages = pages; ), TP_printk("pages:%ld reason:%s (%d)", __entry->pages, __print_symbolic(__entry->reason, TLB_FLUSH_REASON), __entry->reason) ); #endif /* _TRACE_TLB_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 // SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> #include <linux/highuid.h> #include <linux/cred.h> #include <linux/securebits.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <keys/user-type.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/projid.h> #include <linux/fs_struct.h> #include <linux/bsearch.h> #include <linux/sort.h> static struct kmem_cache *user_ns_cachep __read_mostly; static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); static void free_user_ns(struct work_struct *work); static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) { return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES); } static void dec_user_namespaces(struct ucounts *ucounts) { return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); } static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) { /* Start with the same capabilities as init but useless for doing * anything as the capabilities are bound to the new user namespace. */ cred->securebits = SECUREBITS_DEFAULT; cred->cap_inheritable = CAP_EMPTY_SET; cred->cap_permitted = CAP_FULL_SET; cred->cap_effective = CAP_FULL_SET; cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(cred->request_key_auth); cred->request_key_auth = NULL; #endif /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ cred->user_ns = user_ns; } /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the * new namespace. * * This is called by copy_creds(), which will finish setting the target task's * credentials. */ int create_user_ns(struct cred *new) { struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; struct ucounts *ucounts; int ret, i; ret = -ENOSPC; if (parent_ns->level > 32) goto fail; ucounts = inc_user_namespaces(parent_ns, owner); if (!ucounts) goto fail; /* * Verify that we can not violate the policy of which files * may be accessed that is specified by the root directory, * by verifing that the root directory is at the root of the * mount namespace which allows all files to be accessed. */ ret = -EPERM; if (current_chrooted()) goto fail_dec; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. */ ret = -EPERM; if (!kuid_has_mapping(parent_ns, owner) || !kgid_has_mapping(parent_ns, group)) goto fail_dec; ret = -ENOMEM; ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) goto fail_dec; ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); ret = ns_alloc_inum(&ns->ns); if (ret) goto fail_free; ns->ns.ops = &userns_operations; atomic_set(&ns->count, 1); /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->level = parent_ns->level + 1; ns->owner = owner; ns->group = group; INIT_WORK(&ns->work, free_user_ns); for (i = 0; i < UCOUNT_COUNTS; i++) { ns->ucount_max[i] = INT_MAX; } ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ mutex_lock(&userns_state_mutex); ns->flags = parent_ns->flags; mutex_unlock(&userns_state_mutex); #ifdef CONFIG_KEYS INIT_LIST_HEAD(&ns->keyring_name_list); init_rwsem(&ns->keyring_sem); #endif ret = -ENOMEM; if (!setup_userns_sysctls(ns)) goto fail_keyring; set_cred_user_ns(new, ns); return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif ns_free_inum(&ns->ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: dec_user_namespaces(ucounts); fail: return ret; } int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { struct cred *cred; int err = -ENOMEM; if (!(unshare_flags & CLONE_NEWUSER)) return 0; cred = prepare_creds(); if (cred) { err = create_user_ns(cred); if (err) put_cred(cred); else *new_cred = cred; } return err; } static void free_user_ns(struct work_struct *work) { struct user_namespace *parent, *ns = container_of(work, struct user_namespace, work); do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->gid_map.forward); kfree(ns->gid_map.reverse); } if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->uid_map.forward); kfree(ns->uid_map.reverse); } if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->projid_map.forward); kfree(ns->projid_map.reverse); } retire_userns_sysctls(ns); key_free_user_ns(ns); ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); dec_user_namespaces(ucounts); ns = parent; } while (atomic_dec_and_test(&parent->count)); } void __put_user_ns(struct user_namespace *ns) { schedule_work(&ns->work); } EXPORT_SYMBOL(__put_user_ns); /** * idmap_key struct holds the information necessary to find an idmapping in a * sorted idmap array. It is passed to cmp_map_id() as first argument. */ struct idmap_key { bool map_up; /* true -> id from kid; false -> kid from id */ u32 id; /* id to find */ u32 count; /* == 0 unless used with map_id_range_down() */ }; /** * cmp_map_id - Function to be passed to bsearch() to find the requested * idmapping. Expects struct idmap_key to be passed via @k. */ static int cmp_map_id(const void *k, const void *e) { u32 first, last, id2; const struct idmap_key *key = k; const struct uid_gid_extent *el = e; id2 = key->id + key->count - 1; /* handle map_id_{down,up}() */ if (key->map_up) first = el->lower_first; else first = el->first; last = first + el->count - 1; if (key->id >= first && key->id <= last && (id2 >= first && id2 <= last)) return 0; if (key->id < first || id2 < first) return -1; return 1; } /** * map_id_range_down_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = false; key.count = count; key.id = id; return bsearch(&key, map->forward, extents, sizeof(struct uid_gid_extent), cmp_map_id); } /** * map_id_range_down_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; } static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_range_down_base(extents, map, id, count); else extent = map_id_range_down_max(extents, map, id, count); /* Map the id or note failure */ if (extent) id = (id - extent->first) + extent->lower_first; else id = (u32) -1; return id; } static u32 map_id_down(struct uid_gid_map *map, u32 id) { return map_id_range_down(map, id, 1); } /** * map_id_up_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) { unsigned idx; u32 first, last; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last) return &map->extent[idx]; } return NULL; } /** * map_id_up_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) { struct idmap_key key; key.map_up = true; key.count = 1; key.id = id; return bsearch(&key, map->reverse, extents, sizeof(struct uid_gid_extent), cmp_map_id); } static u32 map_id_up(struct uid_gid_map *map, u32 id) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_up_base(extents, map, id); else extent = map_id_up_max(extents, map, id); /* Map the id or note failure */ if (extent) id = (id - extent->lower_first) + extent->first; else id = (u32) -1; return id; } /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in * @uid: User identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace uid * pair INVALID_UID is returned. Callers are expected to test * for and handle INVALID_UID being returned. INVALID_UID * may be tested for using uid_valid(). */ kuid_t make_kuid(struct user_namespace *ns, uid_t uid) { /* Map the uid to a global kernel uid */ return KUIDT_INIT(map_id_down(&ns->uid_map, uid)); } EXPORT_SYMBOL(make_kuid); /** * from_kuid - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * If @kuid has no mapping in @targ (uid_t)-1 is returned. */ uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->uid_map, __kuid_val(kuid)); } EXPORT_SYMBOL(from_kuid); /** * from_kuid_munged - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kuid from_kuid_munged never fails and always * returns a valid uid. This makes from_kuid_munged appropriate * for use in syscalls like stat and getuid where failing the * system call and failing to provide a valid uid are not an * options. * * If @kuid has no mapping in @targ overflowuid is returned. */ uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) { uid_t uid; uid = from_kuid(targ, kuid); if (uid == (uid_t) -1) uid = overflowuid; return uid; } EXPORT_SYMBOL(from_kuid_munged); /** * make_kgid - Map a user-namespace gid pair into a kgid. * @ns: User namespace that the gid is in * @gid: group identifier * * Maps a user-namespace gid pair into a kernel internal kgid, * and returns that kgid. * * When there is no mapping defined for the user-namespace gid * pair INVALID_GID is returned. Callers are expected to test * for and handle INVALID_GID being returned. INVALID_GID may be * tested for using gid_valid(). */ kgid_t make_kgid(struct user_namespace *ns, gid_t gid) { /* Map the gid to a global kernel gid */ return KGIDT_INIT(map_id_down(&ns->gid_map, gid)); } EXPORT_SYMBOL(make_kgid); /** * from_kgid - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * If @kgid has no mapping in @targ (gid_t)-1 is returned. */ gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) { /* Map the gid from a global kernel gid */ return map_id_up(&targ->gid_map, __kgid_val(kgid)); } EXPORT_SYMBOL(from_kgid); /** * from_kgid_munged - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kgid from_kgid_munged never fails and always * returns a valid gid. This makes from_kgid_munged appropriate * for use in syscalls like stat and getgid where failing the * system call and failing to provide a valid gid are not options. * * If @kgid has no mapping in @targ overflowgid is returned. */ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) { gid_t gid; gid = from_kgid(targ, kgid); if (gid == (gid_t) -1) gid = overflowgid; return gid; } EXPORT_SYMBOL(from_kgid_munged); /** * make_kprojid - Map a user-namespace projid pair into a kprojid. * @ns: User namespace that the projid is in * @projid: Project identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace projid * pair INVALID_PROJID is returned. Callers are expected to test * for and handle INVALID_PROJID being returned. INVALID_PROJID * may be tested for using projid_valid(). */ kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) { /* Map the uid to a global kernel uid */ return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid)); } EXPORT_SYMBOL(make_kprojid); /** * from_kprojid - Create a projid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal project identifier to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * If @kprojid has no mapping in @targ (projid_t)-1 is returned. */ projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->projid_map, __kprojid_val(kprojid)); } EXPORT_SYMBOL(from_kprojid); /** * from_kprojid_munged - Create a projiid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal projid to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kprojid from_kprojid_munged never fails and always * returns a valid projid. This makes from_kprojid_munged * appropriate for use in syscalls like stat and where * failing the system call and failing to provide a valid projid are * not an options. * * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned. */ projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid) { projid_t projid; projid = from_kprojid(targ, kprojid); if (projid == (projid_t) -1) projid = OVERFLOW_PROJID; return projid; } EXPORT_SYMBOL(from_kprojid_munged); static int uid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; uid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int gid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; gid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int projid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; projid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { loff_t pos = *ppos; unsigned extents = map->nr_extents; smp_rmb(); if (pos >= extents) return NULL; if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return &map->extent[pos]; return &map->forward[pos]; } static void *uid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->uid_map); } static void *gid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->gid_map); } static void *projid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->projid_map); } static void *m_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; return seq->op->start(seq, pos); } static void m_stop(struct seq_file *seq, void *v) { return; } const struct seq_operations proc_uid_seq_operations = { .start = uid_m_start, .stop = m_stop, .next = m_next, .show = uid_m_show, }; const struct seq_operations proc_gid_seq_operations = { .start = gid_m_start, .stop = m_stop, .next = m_next, .show = gid_m_show, }; const struct seq_operations proc_projid_seq_operations = { .start = projid_m_start, .stop = m_stop, .next = m_next, .show = projid_m_show, }; static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) { u32 upper_first, lower_first, upper_last, lower_last; unsigned idx; upper_first = extent->first; lower_first = extent->lower_first; upper_last = upper_first + extent->count - 1; lower_last = lower_first + extent->count - 1; for (idx = 0; idx < new_map->nr_extents; idx++) { u32 prev_upper_first, prev_lower_first; u32 prev_upper_last, prev_lower_last; struct uid_gid_extent *prev; if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) prev = &new_map->extent[idx]; else prev = &new_map->forward[idx]; prev_upper_first = prev->first; prev_lower_first = prev->lower_first; prev_upper_last = prev_upper_first + prev->count - 1; prev_lower_last = prev_lower_first + prev->count - 1; /* Does the upper range intersect a previous extent? */ if ((prev_upper_first <= upper_last) && (prev_upper_last >= upper_first)) return true; /* Does the lower range intersect a previous extent? */ if ((prev_lower_first <= lower_last) && (prev_lower_last >= lower_first)) return true; } return false; } /** * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. * Takes care to allocate a 4K block of memory if the number of mappings exceeds * UID_GID_MAP_MAX_BASE_EXTENTS. */ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) { struct uid_gid_extent *dest; if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { struct uid_gid_extent *forward; /* Allocate memory for 340 mappings. */ forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS, sizeof(struct uid_gid_extent), GFP_KERNEL); if (!forward) return -ENOMEM; /* Copy over memory. Only set up memory for the forward pointer. * Defer the memory setup for the reverse pointer. */ memcpy(forward, map->extent, map->nr_extents * sizeof(map->extent[0])); map->forward = forward; map->reverse = NULL; } if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) dest = &map->extent[map->nr_extents]; else dest = &map->forward[map->nr_extents]; *dest = *extent; map->nr_extents++; return 0; } /* cmp function to sort() forward mappings */ static int cmp_extents_forward(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->first < e2->first) return -1; if (e1->first > e2->first) return 1; return 0; } /* cmp function to sort() reverse mappings */ static int cmp_extents_reverse(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->lower_first < e2->lower_first) return -1; if (e1->lower_first > e2->lower_first) return 1; return 0; } /** * sort_idmaps - Sorts an array of idmap entries. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static int sort_idmaps(struct uid_gid_map *map) { if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return 0; /* Sort forward array. */ sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_forward, NULL); /* Only copy the memory from forward we actually need. */ map->reverse = kmemdup(map->forward, map->nr_extents * sizeof(struct uid_gid_extent), GFP_KERNEL); if (!map->reverse) return -ENOMEM; /* Sort reverse array. */ sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_reverse, NULL); return 0; } /** * verify_root_map() - check the uid 0 mapping * @file: idmapping file * @map_ns: user namespace of the target process * @new_map: requested idmap * * If a process requests mapping parent uid 0 into the new ns, verify that the * process writing the map had the CAP_SETFCAP capability as the target process * will be able to write fscaps that are valid in ancestor user namespaces. * * Return: true if the mapping is allowed, false if not. */ static bool verify_root_map(const struct file *file, struct user_namespace *map_ns, struct uid_gid_map *new_map) { int idx; const struct user_namespace *file_ns = file->f_cred->user_ns; struct uid_gid_extent *extent0 = NULL; for (idx = 0; idx < new_map->nr_extents; idx++) { if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent0 = &new_map->extent[idx]; else extent0 = &new_map->forward[idx]; if (extent0->lower_first == 0) break; extent0 = NULL; } if (!extent0) return true; if (map_ns == file_ns) { /* The process unshared its ns and is writing to its own * /proc/self/uid_map. User already has full capabilites in * the new namespace. Verify that the parent had CAP_SETFCAP * when it unshared. * */ if (!file_ns->parent_could_setfcap) return false; } else { /* Process p1 is writing to uid_map of p2, who is in a child * user namespace to p1's. Verify that the opener of the map * file has CAP_SETFCAP against the parent of the new map * namespace */ if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP)) return false; } return true; } static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, struct uid_gid_map *map, struct uid_gid_map *parent_map) { struct seq_file *seq = file->private_data; struct user_namespace *map_ns = seq->private; struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent extent; char *kbuf = NULL, *pos, *next_line; ssize_t ret; /* Only allow < page size writes at the beginning of the file */ if ((*ppos != 0) || (count >= PAGE_SIZE)) return -EINVAL; /* Slurp in the user data */ kbuf = memdup_user_nul(buf, count); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); /* * The userns_state_mutex serializes all writes to any given map. * * Any map is only ever written once. * * An id map fits within 1 cache line on most architectures. * * On read nothing needs to be done unless you are on an * architecture with a crazy cache coherency model like alpha. * * There is a one time data dependency between reading the * count of the extents and the values of the extents. The * desired behavior is to see the values of the extents that * were written before the count of the extents. * * To achieve this smp_wmb() is used on guarantee the write * order and smp_rmb() is guaranteed that we don't have crazy * architectures returning stale data. */ mutex_lock(&userns_state_mutex); memset(&new_map, 0, sizeof(struct uid_gid_map)); ret = -EPERM; /* Only allow one successful write to the map */ if (map->nr_extents != 0) goto out; /* * Adjusting namespace settings requires capabilities on the target. */ if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN)) goto out; /* Parse the user data */ ret = -EINVAL; pos = kbuf; for (; pos; pos = next_line) { /* Find the end of line and ensure I don't look past it */ next_line = strchr(pos, '\n'); if (next_line) { *next_line = '\0'; next_line++; if (*next_line == '\0') next_line = NULL; } pos = skip_spaces(pos); extent.first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.lower_first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.count = simple_strtoul(pos, &pos, 10); if (*pos && !isspace(*pos)) goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; /* Verify we have been given valid starting values */ if ((extent.first == (u32) -1) || (extent.lower_first == (u32) -1)) goto out; /* Verify count is not zero and does not cause the * extent to wrap */ if ((extent.first + extent.count) <= extent.first) goto out; if ((extent.lower_first + extent.count) <= extent.lower_first) goto out; /* Do the ranges in extent overlap any previous extents? */ if (mappings_overlap(&new_map, &extent)) goto out; if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS && (next_line != NULL)) goto out; ret = insert_extent(&new_map, &extent); if (ret < 0) goto out; ret = -EINVAL; } /* Be very certaint the new map actually exists */ if (new_map.nr_extents == 0) goto out; ret = -EPERM; /* Validate the user is allowed to use user id's mapped to. */ if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map)) goto out; ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. */ for (idx = 0; idx < new_map.nr_extents; idx++) { struct uid_gid_extent *e; u32 lower_first; if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) e = &new_map.extent[idx]; else e = &new_map.forward[idx]; lower_first = map_id_range_down(parent_map, e->lower_first, e->count); /* Fail if we can not map the specified extent to * the kernel global id space. */ if (lower_first == (u32) -1) goto out; e->lower_first = lower_first; } /* * If we want to use binary search for lookup, this clones the extent * array and sorts both copies. */ ret = sort_idmaps(&new_map); if (ret < 0) goto out; /* Install the map */ if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { memcpy(map->extent, new_map.extent, new_map.nr_extents * sizeof(new_map.extent[0])); } else { map->forward = new_map.forward; map->reverse = new_map.reverse; } smp_wmb(); map->nr_extents = new_map.nr_extents; *ppos = count; ret = count; out: if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(new_map.forward); kfree(new_map.reverse); map->forward = NULL; map->reverse = NULL; map->nr_extents = 0; } mutex_unlock(&userns_state_mutex); kfree(kbuf); return ret; } ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETUID, &ns->uid_map, &ns->parent->uid_map); } ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETGID, &ns->gid_map, &ns->parent->gid_map); } ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; /* Anyone can set any valid project id no capability needed */ return map_write(file, buf, size, ppos, -1, &ns->projid_map, &ns->parent->projid_map); } static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { const struct cred *cred = file->f_cred; if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map)) return false; /* Don't allow mappings that would allow anything that wouldn't * be allowed without the establishment of unprivileged mappings. */ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && uid_eq(ns->owner, cred->euid)) { u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); if (uid_eq(uid, cred->euid)) return true; } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && gid_eq(gid, cred->egid)) return true; } } /* Allow anyone to set a mapping that doesn't require privilege */ if (!cap_valid(cap_setid)) return true; /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. * And the opener of the id file also had the approprpiate capability. */ if (ns_capable(ns->parent, cap_setid) && file_ns_capable(file, ns->parent, cap_setid)) return true; return false; } int proc_setgroups_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; unsigned long userns_flags = READ_ONCE(ns->flags); seq_printf(seq, "%s\n", (userns_flags & USERNS_SETGROUPS_ALLOWED) ? "allow" : "deny"); return 0; } ssize_t proc_setgroups_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; char kbuf[8], *pos; bool setgroups_allowed; ssize_t ret; /* Only allow a very narrow range of strings to be written */ ret = -EINVAL; if ((*ppos != 0) || (count >= sizeof(kbuf))) goto out; /* What was written? */ ret = -EFAULT; if (copy_from_user(kbuf, buf, count)) goto out; kbuf[count] = '\0'; pos = kbuf; /* What is being requested? */ ret = -EINVAL; if (strncmp(pos, "allow", 5) == 0) { pos += 5; setgroups_allowed = true; } else if (strncmp(pos, "deny", 4) == 0) { pos += 4; setgroups_allowed = false; } else goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; ret = -EPERM; mutex_lock(&userns_state_mutex); if (setgroups_allowed) { /* Enabling setgroups after setgroups has been disabled * is not allowed. */ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) goto out_unlock; } else { /* Permanently disabling setgroups after setgroups has * been enabled by writing the gid_map is not allowed. */ if (ns->gid_map.nr_extents != 0) goto out_unlock; ns->flags &= ~USERNS_SETGROUPS_ALLOWED; } mutex_unlock(&userns_state_mutex); /* Report a successful write */ *ppos = count; ret = count; out: return ret; out_unlock: mutex_unlock(&userns_state_mutex); goto out; } bool userns_may_setgroups(const struct user_namespace *ns) { bool allowed; mutex_lock(&userns_state_mutex); /* It is not safe to use setgroups until a gid mapping in * the user namespace has been established. */ allowed = ns->gid_map.nr_extents != 0; /* Is setgroups allowed? */ allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); mutex_unlock(&userns_state_mutex); return allowed; } /* * Returns true if @child is the same namespace or a descendant of * @ancestor. */ bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { const struct user_namespace *ns; for (ns = child; ns->level > ancestor->level; ns = ns->parent) ; return (ns == ancestor); } bool current_in_userns(const struct user_namespace *target_ns) { return in_userns(target_ns, current_user_ns()); } EXPORT_SYMBOL(current_in_userns); static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); } static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; rcu_read_lock(); user_ns = get_user_ns(__task_cred(task)->user_ns); rcu_read_unlock(); return user_ns ? &user_ns->ns : NULL; } static void userns_put(struct ns_common *ns) { put_user_ns(to_user_ns(ns)); } static int userns_install(struct nsset *nsset, struct ns_common *ns) { struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; /* Don't allow gaining capabilities by reentering * the same user namespace. */ if (user_ns == current_user_ns()) return -EINVAL; /* Tasks that share a thread group must share a user namespace */ if (!thread_group_empty(current)) return -EINVAL; if (current->fs->users != 1) return -EINVAL; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; cred = nsset_cred(nsset); if (!cred) return -EINVAL; put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); return 0; } struct ns_common *ns_get_owner(struct ns_common *ns) { struct user_namespace *my_user_ns = current_user_ns(); struct user_namespace *owner, *p; /* See if the owner is in the current user namespace */ owner = p = ns->ops->owner(ns); for (;;) { if (!p) return ERR_PTR(-EPERM); if (p == my_user_ns) break; p = p->parent; } return &get_user_ns(owner)->ns; } static struct user_namespace *userns_owner(struct ns_common *ns) { return to_user_ns(ns)->parent; } const struct proc_ns_operations userns_operations = { .name = "user", .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, .owner = userns_owner, .get_parent = ns_get_owner, }; static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); return 0; } subsys_initcall(user_namespaces_init);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Cryptographic API. * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_INTERNAL_H #define _CRYPTO_INTERNAL_H #include <crypto/algapi.h> #include <linux/completion.h> #include <linux/list.h> #include <linux/module.h> #include <linux/notifier.h> #include <linux/numa.h> #include <linux/refcount.h> #include <linux/rwsem.h> #include <linux/sched.h> #include <linux/types.h> struct crypto_instance; struct crypto_template; struct crypto_larval { struct crypto_alg alg; struct crypto_alg *adult; struct completion completion; u32 mask; }; extern struct list_head crypto_alg_list; extern struct rw_semaphore crypto_alg_sem; extern struct blocking_notifier_head crypto_chain; #ifdef CONFIG_PROC_FS void __init crypto_init_proc(void); void __exit crypto_exit_proc(void); #else static inline void crypto_init_proc(void) { } static inline void crypto_exit_proc(void) { } #endif static inline unsigned int crypto_cipher_ctxsize(struct crypto_alg *alg) { return alg->cra_ctxsize; } static inline unsigned int crypto_compress_ctxsize(struct crypto_alg *alg) { return alg->cra_ctxsize; } struct crypto_alg *crypto_mod_get(struct crypto_alg *alg); struct crypto_alg *crypto_alg_mod_lookup(const char *name, u32 type, u32 mask); struct crypto_larval *crypto_larval_alloc(const char *name, u32 type, u32 mask); void crypto_larval_kill(struct crypto_alg *alg); void crypto_alg_tested(const char *name, int err); void crypto_remove_spawns(struct crypto_alg *alg, struct list_head *list, struct crypto_alg *nalg); void crypto_remove_final(struct list_head *list); void crypto_shoot_alg(struct crypto_alg *alg); struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type, u32 mask); void *crypto_create_tfm_node(struct crypto_alg *alg, const struct crypto_type *frontend, int node); static inline void *crypto_create_tfm(struct crypto_alg *alg, const struct crypto_type *frontend) { return crypto_create_tfm_node(alg, frontend, NUMA_NO_NODE); } struct crypto_alg *crypto_find_alg(const char *alg_name, const struct crypto_type *frontend, u32 type, u32 mask); void *crypto_alloc_tfm_node(const char *alg_name, const struct crypto_type *frontend, u32 type, u32 mask, int node); static inline void *crypto_alloc_tfm(const char *alg_name, const struct crypto_type *frontend, u32 type, u32 mask) { return crypto_alloc_tfm_node(alg_name, frontend, type, mask, NUMA_NO_NODE); } int crypto_probing_notify(unsigned long val, void *v); unsigned int crypto_alg_extsize(struct crypto_alg *alg); int crypto_type_has_alg(const char *name, const struct crypto_type *frontend, u32 type, u32 mask); static inline struct crypto_alg *crypto_alg_get(struct crypto_alg *alg) { refcount_inc(&alg->cra_refcnt); return alg; } static inline void crypto_alg_put(struct crypto_alg *alg) { if (refcount_dec_and_test(&alg->cra_refcnt) && alg->cra_destroy) alg->cra_destroy(alg); } static inline int crypto_tmpl_get(struct crypto_template *tmpl) { return try_module_get(tmpl->module); } static inline void crypto_tmpl_put(struct crypto_template *tmpl) { module_put(tmpl->module); } static inline int crypto_is_larval(struct crypto_alg *alg) { return alg->cra_flags & CRYPTO_ALG_LARVAL; } static inline int crypto_is_dead(struct crypto_alg *alg) { return alg->cra_flags & CRYPTO_ALG_DEAD; } static inline int crypto_is_moribund(struct crypto_alg *alg) { return alg->cra_flags & (CRYPTO_ALG_DEAD | CRYPTO_ALG_DYING); } static inline void crypto_notify(unsigned long val, void *v) { blocking_notifier_call_chain(&crypto_chain, val, v); } static inline void crypto_yield(u32 flags) { if (flags & CRYPTO_TFM_REQ_MAY_SLEEP) cond_resched(); } #endif /* _CRYPTO_INTERNAL_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 /* SPDX-License-Identifier: GPL-2.0-only */ /* * pm_runtime.h - Device run-time power management helper functions. * * Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl> */ #ifndef _LINUX_PM_RUNTIME_H #define _LINUX_PM_RUNTIME_H #include <linux/device.h> #include <linux/notifier.h> #include <linux/pm.h> #include <linux/jiffies.h> /* Runtime PM flag argument bits */ #define RPM_ASYNC 0x01 /* Request is asynchronous */ #define RPM_NOWAIT 0x02 /* Don't wait for concurrent state change */ #define RPM_GET_PUT 0x04 /* Increment/decrement the usage_count */ #define RPM_AUTO 0x08 /* Use autosuspend_delay */ #ifdef CONFIG_PM extern struct workqueue_struct *pm_wq; static inline bool queue_pm_work(struct work_struct *work) { return queue_work(pm_wq, work); } extern int pm_generic_runtime_suspend(struct device *dev); extern int pm_generic_runtime_resume(struct device *dev); extern int pm_runtime_force_suspend(struct device *dev); extern int pm_runtime_force_resume(struct device *dev); extern int __pm_runtime_idle(struct device *dev, int rpmflags); extern int __pm_runtime_suspend(struct device *dev, int rpmflags); extern int __pm_runtime_resume(struct device *dev, int rpmflags); extern int pm_runtime_get_if_active(struct device *dev, bool ign_usage_count); extern int pm_schedule_suspend(struct device *dev, unsigned int delay); extern int __pm_runtime_set_status(struct device *dev, unsigned int status); extern int pm_runtime_barrier(struct device *dev); extern void pm_runtime_enable(struct device *dev); extern void __pm_runtime_disable(struct device *dev, bool check_resume); extern void pm_runtime_allow(struct device *dev); extern void pm_runtime_forbid(struct device *dev); extern void pm_runtime_no_callbacks(struct device *dev); extern void pm_runtime_irq_safe(struct device *dev); extern void __pm_runtime_use_autosuspend(struct device *dev, bool use); extern void pm_runtime_set_autosuspend_delay(struct device *dev, int delay); extern u64 pm_runtime_autosuspend_expiration(struct device *dev); extern void pm_runtime_update_max_time_suspended(struct device *dev, s64 delta_ns); extern void pm_runtime_set_memalloc_noio(struct device *dev, bool enable); extern void pm_runtime_get_suppliers(struct device *dev); extern void pm_runtime_put_suppliers(struct device *dev); extern void pm_runtime_new_link(struct device *dev); extern void pm_runtime_drop_link(struct device_link *link); /** * pm_runtime_get_if_in_use - Conditionally bump up runtime PM usage counter. * @dev: Target device. * * Increment the runtime PM usage counter of @dev if its runtime PM status is * %RPM_ACTIVE and its runtime PM usage counter is greater than 0. */ static inline int pm_runtime_get_if_in_use(struct device *dev) { return pm_runtime_get_if_active(dev, false); } /** * pm_suspend_ignore_children - Set runtime PM behavior regarding children. * @dev: Target device. * @enable: Whether or not to ignore possible dependencies on children. * * The dependencies of @dev on its children will not be taken into account by * the runtime PM framework going forward if @enable is %true, or they will * be taken into account otherwise. */ static inline void pm_suspend_ignore_children(struct device *dev, bool enable) { dev->power.ignore_children = enable; } /** * pm_runtime_get_noresume - Bump up runtime PM usage counter of a device. * @dev: Target device. */ static inline void pm_runtime_get_noresume(struct device *dev) { atomic_inc(&dev->power.usage_count); } /** * pm_runtime_put_noidle - Drop runtime PM usage counter of a device. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev unless it is 0 already. */ static inline void pm_runtime_put_noidle(struct device *dev) { atomic_add_unless(&dev->power.usage_count, -1, 0); } /** * pm_runtime_suspended - Check whether or not a device is runtime-suspended. * @dev: Target device. * * Return %true if runtime PM is enabled for @dev and its runtime PM status is * %RPM_SUSPENDED, or %false otherwise. * * Note that the return value of this function can only be trusted if it is * called under the runtime PM lock of @dev or under conditions in which * runtime PM cannot be either disabled or enabled for @dev and its runtime PM * status cannot change. */ static inline bool pm_runtime_suspended(struct device *dev) { return dev->power.runtime_status == RPM_SUSPENDED && !dev->power.disable_depth; } /** * pm_runtime_active - Check whether or not a device is runtime-active. * @dev: Target device. * * Return %true if runtime PM is disabled for @dev or its runtime PM status is * %RPM_ACTIVE, or %false otherwise. * * Note that the return value of this function can only be trusted if it is * called under the runtime PM lock of @dev or under conditions in which * runtime PM cannot be either disabled or enabled for @dev and its runtime PM * status cannot change. */ static inline bool pm_runtime_active(struct device *dev) { return dev->power.runtime_status == RPM_ACTIVE || dev->power.disable_depth; } /** * pm_runtime_status_suspended - Check if runtime PM status is "suspended". * @dev: Target device. * * Return %true if the runtime PM status of @dev is %RPM_SUSPENDED, or %false * otherwise, regardless of whether or not runtime PM has been enabled for @dev. * * Note that the return value of this function can only be trusted if it is * called under the runtime PM lock of @dev or under conditions in which the * runtime PM status of @dev cannot change. */ static inline bool pm_runtime_status_suspended(struct device *dev) { return dev->power.runtime_status == RPM_SUSPENDED; } /** * pm_runtime_enabled - Check if runtime PM is enabled. * @dev: Target device. * * Return %true if runtime PM is enabled for @dev or %false otherwise. * * Note that the return value of this function can only be trusted if it is * called under the runtime PM lock of @dev or under conditions in which * runtime PM cannot be either disabled or enabled for @dev. */ static inline bool pm_runtime_enabled(struct device *dev) { return !dev->power.disable_depth; } /** * pm_runtime_has_no_callbacks - Check if runtime PM callbacks may be present. * @dev: Target device. * * Return %true if @dev is a special device without runtime PM callbacks or * %false otherwise. */ static inline bool pm_runtime_has_no_callbacks(struct device *dev) { return dev->power.no_callbacks; } /** * pm_runtime_mark_last_busy - Update the last access time of a device. * @dev: Target device. * * Update the last access time of @dev used by the runtime PM autosuspend * mechanism to the current time as returned by ktime_get_mono_fast_ns(). */ static inline void pm_runtime_mark_last_busy(struct device *dev) { WRITE_ONCE(dev->power.last_busy, ktime_get_mono_fast_ns()); } /** * pm_runtime_is_irq_safe - Check if runtime PM can work in interrupt context. * @dev: Target device. * * Return %true if @dev has been marked as an "IRQ-safe" device (with respect * to runtime PM), in which case its runtime PM callabcks can be expected to * work correctly when invoked from interrupt handlers. */ static inline bool pm_runtime_is_irq_safe(struct device *dev) { return dev->power.irq_safe; } extern u64 pm_runtime_suspended_time(struct device *dev); #else /* !CONFIG_PM */ static inline bool queue_pm_work(struct work_struct *work) { return false; } static inline int pm_generic_runtime_suspend(struct device *dev) { return 0; } static inline int pm_generic_runtime_resume(struct device *dev) { return 0; } static inline int pm_runtime_force_suspend(struct device *dev) { return 0; } static inline int pm_runtime_force_resume(struct device *dev) { return 0; } static inline int __pm_runtime_idle(struct device *dev, int rpmflags) { return -ENOSYS; } static inline int __pm_runtime_suspend(struct device *dev, int rpmflags) { return -ENOSYS; } static inline int __pm_runtime_resume(struct device *dev, int rpmflags) { return 1; } static inline int pm_schedule_suspend(struct device *dev, unsigned int delay) { return -ENOSYS; } static inline int pm_runtime_get_if_in_use(struct device *dev) { return -EINVAL; } static inline int pm_runtime_get_if_active(struct device *dev, bool ign_usage_count) { return -EINVAL; } static inline int __pm_runtime_set_status(struct device *dev, unsigned int status) { return 0; } static inline int pm_runtime_barrier(struct device *dev) { return 0; } static inline void pm_runtime_enable(struct device *dev) {} static inline void __pm_runtime_disable(struct device *dev, bool c) {} static inline void pm_runtime_allow(struct device *dev) {} static inline void pm_runtime_forbid(struct device *dev) {} static inline void pm_suspend_ignore_children(struct device *dev, bool enable) {} static inline void pm_runtime_get_noresume(struct device *dev) {} static inline void pm_runtime_put_noidle(struct device *dev) {} static inline bool pm_runtime_suspended(struct device *dev) { return false; } static inline bool pm_runtime_active(struct device *dev) { return true; } static inline bool pm_runtime_status_suspended(struct device *dev) { return false; } static inline bool pm_runtime_enabled(struct device *dev) { return false; } static inline void pm_runtime_no_callbacks(struct device *dev) {} static inline void pm_runtime_irq_safe(struct device *dev) {} static inline bool pm_runtime_is_irq_safe(struct device *dev) { return false; } static inline bool pm_runtime_has_no_callbacks(struct device *dev) { return false; } static inline void pm_runtime_mark_last_busy(struct device *dev) {} static inline void __pm_runtime_use_autosuspend(struct device *dev, bool use) {} static inline void pm_runtime_set_autosuspend_delay(struct device *dev, int delay) {} static inline u64 pm_runtime_autosuspend_expiration( struct device *dev) { return 0; } static inline void pm_runtime_set_memalloc_noio(struct device *dev, bool enable){} static inline void pm_runtime_get_suppliers(struct device *dev) {} static inline void pm_runtime_put_suppliers(struct device *dev) {} static inline void pm_runtime_new_link(struct device *dev) {} static inline void pm_runtime_drop_link(struct device_link *link) {} #endif /* !CONFIG_PM */ /** * pm_runtime_idle - Conditionally set up autosuspend of a device or suspend it. * @dev: Target device. * * Invoke the "idle check" callback of @dev and, depending on its return value, * set up autosuspend of @dev or suspend it (depending on whether or not * autosuspend has been enabled for it). */ static inline int pm_runtime_idle(struct device *dev) { return __pm_runtime_idle(dev, 0); } /** * pm_runtime_suspend - Suspend a device synchronously. * @dev: Target device. */ static inline int pm_runtime_suspend(struct device *dev) { return __pm_runtime_suspend(dev, 0); } /** * pm_runtime_autosuspend - Set up autosuspend of a device or suspend it. * @dev: Target device. * * Set up autosuspend of @dev or suspend it (depending on whether or not * autosuspend is enabled for it) without engaging its "idle check" callback. */ static inline int pm_runtime_autosuspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_AUTO); } /** * pm_runtime_resume - Resume a device synchronously. * @dev: Target device. */ static inline int pm_runtime_resume(struct device *dev) { return __pm_runtime_resume(dev, 0); } /** * pm_request_idle - Queue up "idle check" execution for a device. * @dev: Target device. * * Queue up a work item to run an equivalent of pm_runtime_idle() for @dev * asynchronously. */ static inline int pm_request_idle(struct device *dev) { return __pm_runtime_idle(dev, RPM_ASYNC); } /** * pm_request_resume - Queue up runtime-resume of a device. * @dev: Target device. */ static inline int pm_request_resume(struct device *dev) { return __pm_runtime_resume(dev, RPM_ASYNC); } /** * pm_request_autosuspend - Queue up autosuspend of a device. * @dev: Target device. * * Queue up a work item to run an equivalent pm_runtime_autosuspend() for @dev * asynchronously. */ static inline int pm_request_autosuspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_ASYNC | RPM_AUTO); } /** * pm_runtime_get - Bump up usage counter and queue up resume of a device. * @dev: Target device. * * Bump up the runtime PM usage counter of @dev and queue up a work item to * carry out runtime-resume of it. */ static inline int pm_runtime_get(struct device *dev) { return __pm_runtime_resume(dev, RPM_GET_PUT | RPM_ASYNC); } /** * pm_runtime_get_sync - Bump up usage counter of a device and resume it. * @dev: Target device. * * Bump up the runtime PM usage counter of @dev and carry out runtime-resume of * it synchronously. * * The possible return values of this function are the same as for * pm_runtime_resume() and the runtime PM usage counter of @dev remains * incremented in all cases, even if it returns an error code. */ static inline int pm_runtime_get_sync(struct device *dev) { return __pm_runtime_resume(dev, RPM_GET_PUT); } /** * pm_runtime_resume_and_get - Bump up usage counter of a device and resume it. * @dev: Target device. * * Resume @dev synchronously and if that is successful, increment its runtime * PM usage counter. Return 0 if the runtime PM usage counter of @dev has been * incremented or a negative error code otherwise. */ static inline int pm_runtime_resume_and_get(struct device *dev) { int ret; ret = __pm_runtime_resume(dev, RPM_GET_PUT); if (ret < 0) { pm_runtime_put_noidle(dev); return ret; } return 0; } /** * pm_runtime_put - Drop device usage counter and queue up "idle check" if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, queue up a work item for @dev like in pm_request_idle(). */ static inline int pm_runtime_put(struct device *dev) { return __pm_runtime_idle(dev, RPM_GET_PUT | RPM_ASYNC); } /** * pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, queue up a work item for @dev like in pm_request_autosuspend(). */ static inline int pm_runtime_put_autosuspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_GET_PUT | RPM_ASYNC | RPM_AUTO); } /** * pm_runtime_put_sync - Drop device usage counter and run "idle check" if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, invoke the "idle check" callback of @dev and, depending on its * return value, set up autosuspend of @dev or suspend it (depending on whether * or not autosuspend has been enabled for it). * * The possible return values of this function are the same as for * pm_runtime_idle() and the runtime PM usage counter of @dev remains * decremented in all cases, even if it returns an error code. */ static inline int pm_runtime_put_sync(struct device *dev) { return __pm_runtime_idle(dev, RPM_GET_PUT); } /** * pm_runtime_put_sync_suspend - Drop device usage counter and suspend if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, carry out runtime-suspend of @dev synchronously. * * The possible return values of this function are the same as for * pm_runtime_suspend() and the runtime PM usage counter of @dev remains * decremented in all cases, even if it returns an error code. */ static inline int pm_runtime_put_sync_suspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_GET_PUT); } /** * pm_runtime_put_sync_autosuspend - Drop device usage counter and autosuspend if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, set up autosuspend of @dev or suspend it synchronously (depending * on whether or not autosuspend has been enabled for it). * * The possible return values of this function are the same as for * pm_runtime_autosuspend() and the runtime PM usage counter of @dev remains * decremented in all cases, even if it returns an error code. */ static inline int pm_runtime_put_sync_autosuspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_GET_PUT | RPM_AUTO); } /** * pm_runtime_set_active - Set runtime PM status to "active". * @dev: Target device. * * Set the runtime PM status of @dev to %RPM_ACTIVE and ensure that dependencies * of it will be taken into account. * * It is not valid to call this function for devices with runtime PM enabled. */ static inline int pm_runtime_set_active(struct device *dev) { return __pm_runtime_set_status(dev, RPM_ACTIVE); } /** * pm_runtime_set_suspended - Set runtime PM status to "suspended". * @dev: Target device. * * Set the runtime PM status of @dev to %RPM_SUSPENDED and ensure that * dependencies of it will be taken into account. * * It is not valid to call this function for devices with runtime PM enabled. */ static inline int pm_runtime_set_suspended(struct device *dev) { return __pm_runtime_set_status(dev, RPM_SUSPENDED); } /** * pm_runtime_disable - Disable runtime PM for a device. * @dev: Target device. * * Prevent the runtime PM framework from working with @dev (by incrementing its * "blocking" counter). * * For each invocation of this function for @dev there must be a matching * pm_runtime_enable() call in order for runtime PM to be enabled for it. */ static inline void pm_runtime_disable(struct device *dev) { __pm_runtime_disable(dev, true); } /** * pm_runtime_use_autosuspend - Allow autosuspend to be used for a device. * @dev: Target device. * * Allow the runtime PM autosuspend mechanism to be used for @dev whenever * requested (or "autosuspend" will be handled as direct runtime-suspend for * it). */ static inline void pm_runtime_use_autosuspend(struct device *dev) { __pm_runtime_use_autosuspend(dev, true); } /** * pm_runtime_dont_use_autosuspend - Prevent autosuspend from being used. * @dev: Target device. * * Prevent the runtime PM autosuspend mechanism from being used for @dev which * means that "autosuspend" will be handled as direct runtime-suspend for it * going forward. */ static inline void pm_runtime_dont_use_autosuspend(struct device *dev) { __pm_runtime_use_autosuspend(dev, false); } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 /* Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * * This file is provided under a dual BSD/GPLv2 license. * * SipHash: a fast short-input PRF * https://131002.net/siphash/ * * This implementation is specifically for SipHash2-4 for a secure PRF * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for * hashtables. */ #ifndef _LINUX_SIPHASH_H #define _LINUX_SIPHASH_H #include <linux/types.h> #include <linux/kernel.h> #define SIPHASH_ALIGNMENT __alignof__(u64) typedef struct { u64 key[2]; } siphash_key_t; static inline bool siphash_key_is_zero(const siphash_key_t *key) { return !(key->key[0] | key->key[1]); } u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key); u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key); u64 siphash_1u64(const u64 a, const siphash_key_t *key); u64 siphash_2u64(const u64 a, const u64 b, const siphash_key_t *key); u64 siphash_3u64(const u64 a, const u64 b, const u64 c, const siphash_key_t *key); u64 siphash_4u64(const u64 a, const u64 b, const u64 c, const u64 d, const siphash_key_t *key); u64 siphash_1u32(const u32 a, const siphash_key_t *key); u64 siphash_3u32(const u32 a, const u32 b, const u32 c, const siphash_key_t *key); static inline u64 siphash_2u32(const u32 a, const u32 b, const siphash_key_t *key) { return siphash_1u64((u64)b << 32 | a, key); } static inline u64 siphash_4u32(const u32 a, const u32 b, const u32 c, const u32 d, const siphash_key_t *key) { return siphash_2u64((u64)b << 32 | a, (u64)d << 32 | c, key); } static inline u64 ___siphash_aligned(const __le64 *data, size_t len, const siphash_key_t *key) { if (__builtin_constant_p(len) && len == 4) return siphash_1u32(le32_to_cpup((const __le32 *)data), key); if (__builtin_constant_p(len) && len == 8) return siphash_1u64(le64_to_cpu(data[0]), key); if (__builtin_constant_p(len) && len == 16) return siphash_2u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]), key); if (__builtin_constant_p(len) && len == 24) return siphash_3u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]), le64_to_cpu(data[2]), key); if (__builtin_constant_p(len) && len == 32) return siphash_4u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]), le64_to_cpu(data[2]), le64_to_cpu(data[3]), key); return __siphash_aligned(data, len, key); } /** * siphash - compute 64-bit siphash PRF value * @data: buffer to hash * @size: size of @data * @key: the siphash key */ static inline u64 siphash(const void *data, size_t len, const siphash_key_t *key) { if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || !IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT)) return __siphash_unaligned(data, len, key); return ___siphash_aligned(data, len, key); } #define HSIPHASH_ALIGNMENT __alignof__(unsigned long) typedef struct { unsigned long key[2]; } hsiphash_key_t; u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key); u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key); u32 hsiphash_1u32(const u32 a, const hsiphash_key_t *key); u32 hsiphash_2u32(const u32 a, const u32 b, const hsiphash_key_t *key); u32 hsiphash_3u32(const u32 a, const u32 b, const u32 c, const hsiphash_key_t *key); u32 hsiphash_4u32(const u32 a, const u32 b, const u32 c, const u32 d, const hsiphash_key_t *key); static inline u32 ___hsiphash_aligned(const __le32 *data, size_t len, const hsiphash_key_t *key) { if (__builtin_constant_p(len) && len == 4) return hsiphash_1u32(le32_to_cpu(data[0]), key); if (__builtin_constant_p(len) && len == 8) return hsiphash_2u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]), key); if (__builtin_constant_p(len) && len == 12) return hsiphash_3u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]), le32_to_cpu(data[2]), key); if (__builtin_constant_p(len) && len == 16) return hsiphash_4u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]), le32_to_cpu(data[2]), le32_to_cpu(data[3]), key); return __hsiphash_aligned(data, len, key); } /** * hsiphash - compute 32-bit hsiphash PRF value * @data: buffer to hash * @size: size of @data * @key: the hsiphash key */ static inline u32 hsiphash(const void *data, size_t len, const hsiphash_key_t *key) { if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || !IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT)) return __hsiphash_unaligned(data, len, key); return ___hsiphash_aligned(data, len, key); } #endif /* _LINUX_SIPHASH_H */
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PGTABLE_H #define _LINUX_PGTABLE_H #include <linux/pfn.h> #include <asm/pgtable.h> #ifndef __ASSEMBLY__ #ifdef CONFIG_MMU #include <linux/mm_types.h> #include <linux/bug.h> #include <linux/errno.h> #include <asm-generic/pgtable_uffd.h> #if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \ defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS #error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED #endif /* * On almost all architectures and configurations, 0 can be used as the * upper ceiling to free_pgtables(): on many architectures it has the same * effect as using TASK_SIZE. However, there is one configuration which * must impose a more careful limit, to avoid freeing kernel pgtables. */ #ifndef USER_PGTABLES_CEILING #define USER_PGTABLES_CEILING 0UL #endif /* * A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD] * * The pXx_index() functions return the index of the entry in the page * table page which would control the given virtual address * * As these functions may be used by the same code for different levels of * the page table folding, they are always available, regardless of * CONFIG_PGTABLE_LEVELS value. For the folded levels they simply return 0 * because in such cases PTRS_PER_PxD equals 1. */ static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); } #ifndef pmd_index static inline unsigned long pmd_index(unsigned long address) { return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); } #define pmd_index pmd_index #endif #ifndef pud_index static inline unsigned long pud_index(unsigned long address) { return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); } #define pud_index pud_index #endif #ifndef pgd_index /* Must be a compile-time constant, so implement it as a macro */ #define pgd_index(a) (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) #endif #ifndef pte_offset_kernel static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) { return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address); } #define pte_offset_kernel pte_offset_kernel #endif #if defined(CONFIG_HIGHPTE) #define pte_offset_map(dir, address) \ ((pte_t *)kmap_atomic(pmd_page(*(dir))) + \ pte_index((address))) #define pte_unmap(pte) kunmap_atomic((pte)) #else #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) #define pte_unmap(pte) ((void)(pte)) /* NOP */ #endif /* Find an entry in the second-level page table.. */ #ifndef pmd_offset static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); } #define pmd_offset pmd_offset #endif #ifndef pud_offset static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) { return (pud_t *)p4d_page_vaddr(*p4d) + pud_index(address); } #define pud_offset pud_offset #endif static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address) { return (pgd + pgd_index(address)); }; /* * a shortcut to get a pgd_t in a given mm */ #ifndef pgd_offset #define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address)) #endif /* * a shortcut which implies the use of the kernel's pgd, instead * of a process's */ #ifndef pgd_offset_k #define pgd_offset_k(address) pgd_offset(&init_mm, (address)) #endif /* * In many cases it is known that a virtual address is mapped at PMD or PTE * level, so instead of traversing all the page table levels, we can get a * pointer to the PMD entry in user or kernel page table or translate a virtual * address to the pointer in the PTE in the kernel page tables with simple * helpers. */ static inline pmd_t *pmd_off(struct mm_struct *mm, unsigned long va) { return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va); } static inline pmd_t *pmd_off_k(unsigned long va) { return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va); } static inline pte_t *virt_to_kpte(unsigned long vaddr) { pmd_t *pmd = pmd_off_k(vaddr); return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr); } #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); #endif #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); extern int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty); #else static inline int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { BUILD_BUG(); return 0; } static inline int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { pte_t pte = *ptep; int r = 1; if (!pte_young(pte)) r = 0; else set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte)); return r; } #endif #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; int r = 1; if (!pmd_young(pmd)) r = 0; else set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd)); return r; } #else static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #else /* * Despite relevant to THP only, this API is called from generic rmap code * under PageTransHuge(), hence needs a dummy implementation for !THP */ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t pte = *ptep; pte_clear(mm, address, ptep); return pte; } #endif #ifndef __HAVE_ARCH_PTEP_GET static inline pte_t ptep_get(pte_t *ptep) { return READ_ONCE(*ptep); } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; pmd_clear(pmdp); return pmd; } #endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */ #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pud_t *pudp) { pud_t pud = *pudp; pud_clear(pudp); return pud; } #endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, int full) { return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); } #endif #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL static inline pud_t pudp_huge_get_and_clear_full(struct mm_struct *mm, unsigned long address, pud_t *pudp, int full) { return pudp_huge_get_and_clear(mm, address, pudp); } #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { pte_t pte; pte = ptep_get_and_clear(mm, address, ptep); return pte; } #endif /* * If two threads concurrently fault at the same page, the thread that * won the race updates the PTE and its local TLB/Cache. The other thread * gives up, simply does nothing, and continues; on architectures where * software can update TLB, local TLB can be updated here to avoid next page * fault. This function updates TLB only, do nothing with cache or others. * It is the difference with function update_mmu_cache. */ #ifndef __HAVE_ARCH_UPDATE_MMU_TLB static inline void update_mmu_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { } #define __HAVE_ARCH_UPDATE_MMU_TLB #endif /* * Some architectures may be able to avoid expensive synchronization * primitives when modifications are made to PTE's which are already * not present, or in the process of an address space destruction. */ #ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL static inline void pte_clear_not_present_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { pte_clear(mm, address, ptep); } #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH extern pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pud_t *pudp); #endif #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT struct mm_struct; static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t old_pte = *ptep; set_pte_at(mm, address, ptep, pte_wrprotect(old_pte)); } #endif /* * On some architectures hardware does not set page access bit when accessing * memory page, it is responsibilty of software setting this bit. It brings * out extra page fault penalty to track page access bit. For optimization page * access bit can be set during all page fault flow on these arches. * To be differentiate with macro pte_mkyoung, this macro is used on platforms * where software maintains page access bit. */ #ifndef pte_sw_mkyoung static inline pte_t pte_sw_mkyoung(pte_t pte) { return pte; } #define pte_sw_mkyoung pte_sw_mkyoung #endif #ifndef pte_savedwrite #define pte_savedwrite pte_write #endif #ifndef pte_mk_savedwrite #define pte_mk_savedwrite pte_mkwrite #endif #ifndef pte_clear_savedwrite #define pte_clear_savedwrite pte_wrprotect #endif #ifndef pmd_savedwrite #define pmd_savedwrite pmd_write #endif #ifndef pmd_mk_savedwrite #define pmd_mk_savedwrite pmd_mkwrite #endif #ifndef pmd_clear_savedwrite #define pmd_clear_savedwrite pmd_wrprotect #endif #ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { pmd_t old_pmd = *pmdp; set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd)); } #else static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { pud_t old_pud = *pudp; set_pud_at(mm, address, pudp, pud_wrprotect(old_pud)); } #else static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { BUILD_BUG(); } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #endif #ifndef pmdp_collapse_flush #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #else static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return *pmdp; } #define pmdp_collapse_flush pmdp_collapse_flush #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable); #endif #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * This is an implementation of pmdp_establish() that is only suitable for an * architecture that doesn't have hardware dirty/accessed bits. In this case we * can't race with CPU which sets these bits and non-atomic aproach is fine. */ static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { pmd_t old_pmd = *pmdp; set_pmd_at(vma->vm_mm, address, pmdp, pmd); return old_pmd; } #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { return pte_val(pte_a) == pte_val(pte_b); } #endif #ifndef __HAVE_ARCH_PTE_UNUSED /* * Some architectures provide facilities to virtualization guests * so that they can flag allocated pages as unused. This allows the * host to transparently reclaim unused pages. This function returns * whether the pte's page is unused. */ static inline int pte_unused(pte_t pte) { return 0; } #endif #ifndef pte_access_permitted #define pte_access_permitted(pte, write) \ (pte_present(pte) && (!(write) || pte_write(pte))) #endif #ifndef pmd_access_permitted #define pmd_access_permitted(pmd, write) \ (pmd_present(pmd) && (!(write) || pmd_write(pmd))) #endif #ifndef pud_access_permitted #define pud_access_permitted(pud, write) \ (pud_present(pud) && (!(write) || pud_write(pud))) #endif #ifndef p4d_access_permitted #define p4d_access_permitted(p4d, write) \ (p4d_present(p4d) && (!(write) || p4d_write(p4d))) #endif #ifndef pgd_access_permitted #define pgd_access_permitted(pgd, write) \ (pgd_present(pgd) && (!(write) || pgd_write(pgd))) #endif #ifndef __HAVE_ARCH_PMD_SAME static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { return pmd_val(pmd_a) == pmd_val(pmd_b); } static inline int pud_same(pud_t pud_a, pud_t pud_b) { return pud_val(pud_a) == pud_val(pud_b); } #endif #ifndef __HAVE_ARCH_P4D_SAME static inline int p4d_same(p4d_t p4d_a, p4d_t p4d_b) { return p4d_val(p4d_a) == p4d_val(p4d_b); } #endif #ifndef __HAVE_ARCH_PGD_SAME static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) { return pgd_val(pgd_a) == pgd_val(pgd_b); } #endif /* * Use set_p*_safe(), and elide TLB flushing, when confident that *no* * TLB flush will be required as a result of the "set". For example, use * in scenarios where it is known ahead of time that the routine is * setting non-present entries, or re-setting an existing entry to the * same value. Otherwise, use the typical "set" helpers and flush the * TLB. */ #define set_pte_safe(ptep, pte) \ ({ \ WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \ set_pte(ptep, pte); \ }) #define set_pmd_safe(pmdp, pmd) \ ({ \ WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd)); \ set_pmd(pmdp, pmd); \ }) #define set_pud_safe(pudp, pud) \ ({ \ WARN_ON_ONCE(pud_present(*pudp) && !pud_same(*pudp, pud)); \ set_pud(pudp, pud); \ }) #define set_p4d_safe(p4dp, p4d) \ ({ \ WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \ set_p4d(p4dp, p4d); \ }) #define set_pgd_safe(pgdp, pgd) \ ({ \ WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \ set_pgd(pgdp, pgd); \ }) #ifndef __HAVE_ARCH_DO_SWAP_PAGE /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be * restored when the page is swapped back in. SPARC M7 and newer * processors support an ADI (Application Data Integrity) tag for the * page as metadata for the page. arch_do_swap_page() can restore this * metadata when a page is swapped back in. */ static inline void arch_do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t oldpte) { } #endif #ifndef __HAVE_ARCH_UNMAP_ONE /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be * restored when the page is swapped back in. SPARC M7 and newer * processors support an ADI (Application Data Integrity) tag for the * page as metadata for the page. arch_unmap_one() can save this * metadata on a swap-out of a page. */ static inline int arch_unmap_one(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t orig_pte) { return 0; } #endif /* * Allow architectures to preserve additional metadata associated with * swapped-out pages. The corresponding __HAVE_ARCH_SWAP_* macros and function * prototypes must be defined in the arch-specific asm/pgtable.h file. */ #ifndef __HAVE_ARCH_PREPARE_TO_SWAP static inline int arch_prepare_to_swap(struct page *page) { return 0; } #endif #ifndef __HAVE_ARCH_SWAP_INVALIDATE static inline void arch_swap_invalidate_page(int type, pgoff_t offset) { } static inline void arch_swap_invalidate_area(int type) { } #endif #ifndef __HAVE_ARCH_SWAP_RESTORE static inline void arch_swap_restore(swp_entry_t entry, struct page *page) { } #endif #ifndef __HAVE_ARCH_PGD_OFFSET_GATE #define pgd_offset_gate(mm, addr) pgd_offset(mm, addr) #endif #ifndef __HAVE_ARCH_MOVE_PTE #define move_pte(pte, prot, old_addr, new_addr) (pte) #endif #ifndef pte_accessible # define pte_accessible(mm, pte) ((void)(pte), 1) #endif #ifndef flush_tlb_fix_spurious_fault #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) #endif /* * When walking page tables, get the address of the next boundary, * or the end address of the range if that comes earlier. Although no * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout. */ #define pgd_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #ifndef p4d_addr_end #define p4d_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + P4D_SIZE) & P4D_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif #ifndef pud_addr_end #define pud_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif #ifndef pmd_addr_end #define pmd_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif /* * When walking page tables, we usually want to skip any p?d_none entries; * and any p?d_bad entries - reporting the error before resetting to none. * Do the tests inline, but report and clear the bad entry in mm/memory.c. */ void pgd_clear_bad(pgd_t *); #ifndef __PAGETABLE_P4D_FOLDED void p4d_clear_bad(p4d_t *); #else #define p4d_clear_bad(p4d) do { } while (0) #endif #ifndef __PAGETABLE_PUD_FOLDED void pud_clear_bad(pud_t *); #else #define pud_clear_bad(p4d) do { } while (0) #endif void pmd_clear_bad(pmd_t *); static inline int pgd_none_or_clear_bad(pgd_t *pgd) { if (pgd_none(*pgd)) return 1; if (unlikely(pgd_bad(*pgd))) { pgd_clear_bad(pgd); return 1; } return 0; } static inline int p4d_none_or_clear_bad(p4d_t *p4d) { if (p4d_none(*p4d)) return 1; if (unlikely(p4d_bad(*p4d))) { p4d_clear_bad(p4d); return 1; } return 0; } static inline int pud_none_or_clear_bad(pud_t *pud) { if (pud_none(*pud)) return 1; if (unlikely(pud_bad(*pud))) { pud_clear_bad(pud); return 1; } return 0; } static inline int pmd_none_or_clear_bad(pmd_t *pmd) { if (pmd_none(*pmd)) return 1; if (unlikely(pmd_bad(*pmd))) { pmd_clear_bad(pmd); return 1; } return 0; } static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { /* * Get the current pte state, but zero it out to make it * non-present, preventing the hardware from asynchronously * updating it. */ return ptep_get_and_clear(vma->vm_mm, addr, ptep); } static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte) { /* * The pte is non-present, so there's no hardware state to * preserve. */ set_pte_at(vma->vm_mm, addr, ptep, pte); } #ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION /* * Start a pte protection read-modify-write transaction, which * protects against asynchronous hardware modifications to the pte. * The intention is not to prevent the hardware from making pte * updates, but to prevent any updates it may make from being lost. * * This does not protect against other software modifications of the * pte; the appropriate pte lock must be held over the transation. * * Note that this interface is intended to be batchable, meaning that * ptep_modify_prot_commit may not actually update the pte, but merely * queue the update to be done at some later time. The update must be * actually committed before the pte lock is released, however. */ static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return __ptep_modify_prot_start(vma, addr, ptep); } /* * Commit an update to a pte, leaving any hardware-controlled bits in * the PTE unmodified. */ static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { __ptep_modify_prot_commit(vma, addr, ptep, pte); } #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ #endif /* CONFIG_MMU */ /* * No-op macros that just return the current protection value. Defined here * because these macros can be used even if CONFIG_MMU is not defined. */ #ifndef pgprot_nx #define pgprot_nx(prot) (prot) #endif #ifndef pgprot_noncached #define pgprot_noncached(prot) (prot) #endif #ifndef pgprot_writecombine #define pgprot_writecombine pgprot_noncached #endif #ifndef pgprot_writethrough #define pgprot_writethrough pgprot_noncached #endif #ifndef pgprot_device #define pgprot_device pgprot_noncached #endif #ifndef pgprot_mhp #define pgprot_mhp(prot) (prot) #endif #ifdef CONFIG_MMU #ifndef pgprot_modify #define pgprot_modify pgprot_modify static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) { if (pgprot_val(oldprot) == pgprot_val(pgprot_noncached(oldprot))) newprot = pgprot_noncached(newprot); if (pgprot_val(oldprot) == pgprot_val(pgprot_writecombine(oldprot))) newprot = pgprot_writecombine(newprot); if (pgprot_val(oldprot) == pgprot_val(pgprot_device(oldprot))) newprot = pgprot_device(newprot); return newprot; } #endif #endif /* CONFIG_MMU */ #ifndef pgprot_encrypted #define pgprot_encrypted(prot) (prot) #endif #ifndef pgprot_decrypted #define pgprot_decrypted(prot) (prot) #endif /* * A facility to provide lazy MMU batching. This allows PTE updates and * page invalidations to be delayed until a call to leave lazy MMU mode * is issued. Some architectures may benefit from doing this, and it is * beneficial for both shadow and direct mode hypervisors, which may batch * the PTE updates which happen during this window. Note that using this * interface requires that read hazards be removed from the code. A read * hazard could result in the direct mode hypervisor case, since the actual * write to the page tables may not yet have taken place, so reads though * a raw PTE pointer after it has been modified are not guaranteed to be * up to date. This mode can only be entered and left under the protection of * the page table locks for all page tables which may be modified. In the UP * case, this is required so that preemption is disabled, and in the SMP case, * it must synchronize the delayed page table writes properly on other CPUs. */ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE #define arch_enter_lazy_mmu_mode() do {} while (0) #define arch_leave_lazy_mmu_mode() do {} while (0) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif /* * A facility to provide batching of the reload of page tables and * other process state with the actual context switch code for * paravirtualized guests. By convention, only one of the batched * update (lazy) modes (CPU, MMU) should be active at any given time, * entry should never be nested, and entry and exits should always be * paired. This is for sanity of maintaining and reasoning about the * kernel code. In this case, the exit (end of the context switch) is * in architecture-specific code, and so doesn't need a generic * definition. */ #ifndef __HAVE_ARCH_START_CONTEXT_SWITCH #define arch_start_context_switch(prev) do {} while (0) #endif #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) { return pmd; } static inline int pmd_swp_soft_dirty(pmd_t pmd) { return 0; } static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) { return pmd; } #endif #else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */ static inline int pte_soft_dirty(pte_t pte) { return 0; } static inline int pmd_soft_dirty(pmd_t pmd) { return 0; } static inline pte_t pte_mksoft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) { return pmd; } static inline pte_t pte_clear_soft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) { return pmd; } static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { return pte; } static inline int pte_swp_soft_dirty(pte_t pte) { return 0; } static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) { return pmd; } static inline int pmd_swp_soft_dirty(pmd_t pmd) { return 0; } static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) { return pmd; } #endif #ifndef __HAVE_PFNMAP_TRACKING /* * Interfaces that can be used by architecture code to keep track of * memory type of pfn mappings specified by the remap_pfn_range, * vmf_insert_pfn. */ /* * track_pfn_remap is called when a _new_ pfn mapping is being established * by remap_pfn_range() for physical range indicated by pfn and size. */ static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, unsigned long size) { return 0; } /* * track_pfn_insert is called when a _new_ single pfn is established * by vmf_insert_pfn(). */ static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn) { } /* * track_pfn_copy is called when vma that is covering the pfnmap gets * copied through copy_page_range(). */ static inline int track_pfn_copy(struct vm_area_struct *vma) { return 0; } /* * untrack_pfn is called while unmapping a pfnmap for a region. * untrack can be called for a specific region indicated by pfn and size or * can be for the entire vma (in which case pfn, size are zero). */ static inline void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size) { } /* * untrack_pfn_moved is called while mremapping a pfnmap for a new region. */ static inline void untrack_pfn_moved(struct vm_area_struct *vma) { } #else extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, unsigned long size); extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn); extern int track_pfn_copy(struct vm_area_struct *vma); extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size); extern void untrack_pfn_moved(struct vm_area_struct *vma); #endif #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_pfn; unsigned long offset_from_zero_pfn = pfn - zero_pfn; return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); } #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) #else static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_pfn; return pfn == zero_pfn; } static inline unsigned long my_zero_pfn(unsigned long addr) { extern unsigned long zero_pfn; return zero_pfn; } #endif #ifdef CONFIG_MMU #ifndef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmd_trans_huge(pmd_t pmd) { return 0; } #ifndef pmd_write static inline int pmd_write(pmd_t pmd) { BUG(); return 0; } #endif /* pmd_write */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifndef pud_write static inline int pud_write(pud_t pud) { BUG(); return 0; } #endif /* pud_write */ #if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline int pmd_devmap(pmd_t pmd) { return 0; } static inline int pud_devmap(pud_t pud) { return 0; } static inline int pgd_devmap(pgd_t pgd) { return 0; } #endif #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ (defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) static inline int pud_trans_huge(pud_t pud) { return 0; } #endif /* See pmd_none_or_trans_huge_or_clear_bad for discussion. */ static inline int pud_none_or_trans_huge_or_dev_or_clear_bad(pud_t *pud) { pud_t pudval = READ_ONCE(*pud); if (pud_none(pudval) || pud_trans_huge(pudval) || pud_devmap(pudval)) return 1; if (unlikely(pud_bad(pudval))) { pud_clear_bad(pud); return 1; } return 0; } /* See pmd_trans_unstable for discussion. */ static inline int pud_trans_unstable(pud_t *pud) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) return pud_none_or_trans_huge_or_dev_or_clear_bad(pud); #else return 0; #endif } #ifndef pmd_read_atomic static inline pmd_t pmd_read_atomic(pmd_t *pmdp) { /* * Depend on compiler for an atomic pmd read. NOTE: this is * only going to work, if the pmdval_t isn't larger than * an unsigned long. */ return *pmdp; } #endif #ifndef arch_needs_pgtable_deposit #define arch_needs_pgtable_deposit() (false) #endif /* * This function is meant to be used by sites walking pagetables with * the mmap_lock held in read mode to protect against MADV_DONTNEED and * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd * into a null pmd and the transhuge page fault can convert a null pmd * into an hugepmd or into a regular pmd (if the hugepage allocation * fails). While holding the mmap_lock in read mode the pmd becomes * stable and stops changing under us only if it's not null and not a * transhuge pmd. When those races occurs and this function makes a * difference vs the standard pmd_none_or_clear_bad, the result is * undefined so behaving like if the pmd was none is safe (because it * can return none anyway). The compiler level barrier() is critically * important to compute the two checks atomically on the same pmdval. * * For 32bit kernels with a 64bit large pmd_t this automatically takes * care of reading the pmd atomically to avoid SMP race conditions * against pmd_populate() when the mmap_lock is hold for reading by the * caller (a special atomic read not done by "gcc" as in the generic * version above, is also needed when THP is disabled because the page * fault can populate the pmd from under us). */ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) { pmd_t pmdval = pmd_read_atomic(pmd); /* * The barrier will stabilize the pmdval in a register or on * the stack so that it will stop changing under the code. * * When CONFIG_TRANSPARENT_HUGEPAGE=y on x86 32bit PAE, * pmd_read_atomic is allowed to return a not atomic pmdval * (for example pointing to an hugepage that has never been * mapped in the pmd). The below checks will only care about * the low part of the pmd with 32bit PAE x86 anyway, with the * exception of pmd_none(). So the important thing is that if * the low part of the pmd is found null, the high part will * be also null or the pmd_none() check below would be * confused. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE barrier(); #endif /* * !pmd_present() checks for pmd migration entries * * The complete check uses is_pmd_migration_entry() in linux/swapops.h * But using that requires moving current function and pmd_trans_unstable() * to linux/swapops.h to resovle dependency, which is too much code move. * * !pmd_present() is equivalent to is_pmd_migration_entry() currently, * because !pmd_present() pages can only be under migration not swapped * out. * * pmd_none() is preseved for future condition checks on pmd migration * entries and not confusing with this function name, although it is * redundant with !pmd_present(). */ if (pmd_none(pmdval) || pmd_trans_huge(pmdval) || (IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION) && !pmd_present(pmdval))) return 1; if (unlikely(pmd_bad(pmdval))) { pmd_clear_bad(pmd); return 1; } return 0; } /* * This is a noop if Transparent Hugepage Support is not built into * the kernel. Otherwise it is equivalent to * pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in * places that already verified the pmd is not none and they want to * walk ptes while holding the mmap sem in read mode (write mode don't * need this). If THP is not enabled, the pmd can't go away under the * code even if MADV_DONTNEED runs, but if THP is enabled we need to * run a pmd_trans_unstable before walking the ptes after * split_huge_pmd returns (because it may have run when the pmd become * null, but then a page fault can map in a THP and not a regular page). */ static inline int pmd_trans_unstable(pmd_t *pmd) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE return pmd_none_or_trans_huge_or_clear_bad(pmd); #else return 0; #endif } #ifndef CONFIG_NUMA_BALANCING /* * Technically a PTE can be PROTNONE even when not doing NUMA balancing but * the only case the kernel cares is for NUMA balancing and is only ever set * when the VMA is accessible. For PROT_NONE VMAs, the PTEs are not marked * _PAGE_PROTNONE so by default, implement the helper as "always no". It * is the responsibility of the caller to distinguish between PROT_NONE * protections and NUMA hinting fault protections. */ static inline int pte_protnone(pte_t pte) { return 0; } static inline int pmd_protnone(pmd_t pmd) { return 0; } #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_MMU */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP #ifndef __PAGETABLE_P4D_FOLDED int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot); int p4d_clear_huge(p4d_t *p4d); #else static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int p4d_clear_huge(p4d_t *p4d) { return 0; } #endif /* !__PAGETABLE_P4D_FOLDED */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); int pud_clear_huge(pud_t *pud); int pmd_clear_huge(pmd_t *pmd); int p4d_free_pud_page(p4d_t *p4d, unsigned long addr); int pud_free_pmd_page(pud_t *pud, unsigned long addr); int pmd_free_pte_page(pmd_t *pmd, unsigned long addr); #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int p4d_clear_huge(p4d_t *p4d) { return 0; } static inline int pud_clear_huge(pud_t *pud) { return 0; } static inline int pmd_clear_huge(pmd_t *pmd) { return 0; } static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) { return 0; } static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr) { return 0; } static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { return 0; } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ #ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * ARCHes with special requirements for evicting THP backing TLB entries can * implement this. Otherwise also, it can help optimize normal TLB flush in * THP regime. Stock flush_tlb_range() typically has optimization to nuke the * entire TLB if flush span is greater than a threshold, which will * likely be true for a single huge page. Thus a single THP flush will * invalidate the entire TLB which is not desirable. * e.g. see arch/arc: flush_pmd_tlb_range */ #define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #define flush_pud_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #else #define flush_pmd_tlb_range(vma, addr, end) BUILD_BUG() #define flush_pud_tlb_range(vma, addr, end) BUILD_BUG() #endif #endif struct file; int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot); #ifndef CONFIG_X86_ESPFIX64 static inline void init_espfix_bsp(void) { } #endif extern void __init pgtable_cache_init(void); #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) { return true; } static inline bool arch_has_pfn_modify_check(void) { return false; } #endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */ /* * Architecture PAGE_KERNEL_* fallbacks * * Some architectures don't define certain PAGE_KERNEL_* flags. This is either * because they really don't support them, or the port needs to be updated to * reflect the required functionality. Below are a set of relatively safe * fallbacks, as best effort, which we can count on in lieu of the architectures * not defining them on their own yet. */ #ifndef PAGE_KERNEL_RO # define PAGE_KERNEL_RO PAGE_KERNEL #endif #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif /* * Page Table Modification bits for pgtbl_mod_mask. * * These are used by the p?d_alloc_track*() set of functions an in the generic * vmalloc/ioremap code to track at which page-table levels entries have been * modified. Based on that the code can better decide when vmalloc and ioremap * mapping changes need to be synchronized to other page-tables in the system. */ #define __PGTBL_PGD_MODIFIED 0 #define __PGTBL_P4D_MODIFIED 1 #define __PGTBL_PUD_MODIFIED 2 #define __PGTBL_PMD_MODIFIED 3 #define __PGTBL_PTE_MODIFIED 4 #define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED) #define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED) #define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED) #define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED) #define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED) /* Page-Table Modification Mask */ typedef unsigned int pgtbl_mod_mask; #endif /* !__ASSEMBLY__ */ #if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT) #ifdef CONFIG_PHYS_ADDR_T_64BIT /* * ZSMALLOC needs to know the highest PFN on 32-bit architectures * with physical address space extension, but falls back to * BITS_PER_LONG otherwise. */ #error Missing MAX_POSSIBLE_PHYSMEM_BITS definition #else #define MAX_POSSIBLE_PHYSMEM_BITS 32 #endif #endif #ifndef has_transparent_hugepage #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define has_transparent_hugepage() 1 #else #define has_transparent_hugepage() 0 #endif #endif /* * On some architectures it depends on the mm if the p4d/pud or pmd * layer of the page table hierarchy is folded or not. */ #ifndef mm_p4d_folded #define mm_p4d_folded(mm) __is_defined(__PAGETABLE_P4D_FOLDED) #endif #ifndef mm_pud_folded #define mm_pud_folded(mm) __is_defined(__PAGETABLE_PUD_FOLDED) #endif #ifndef mm_pmd_folded #define mm_pmd_folded(mm) __is_defined(__PAGETABLE_PMD_FOLDED) #endif #ifndef p4d_offset_lockless #define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address) #endif #ifndef pud_offset_lockless #define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address) #endif #ifndef pmd_offset_lockless #define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address) #endif /* * p?d_leaf() - true if this entry is a final mapping to a physical address. * This differs from p?d_huge() by the fact that they are always available (if * the architecture supports large pages at the appropriate level) even * if CONFIG_HUGETLB_PAGE is not defined. * Only meaningful when called on a valid entry. */ #ifndef pgd_leaf #define pgd_leaf(x) 0 #endif #ifndef p4d_leaf #define p4d_leaf(x) 0 #endif #ifndef pud_leaf #define pud_leaf(x) 0 #endif #ifndef pmd_leaf #define pmd_leaf(x) 0 #endif #endif /* _LINUX_PGTABLE_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _ASM_X86_APIC_H #define _ASM_X86_APIC_H #include <linux/cpumask.h> #include <asm/alternative.h> #include <asm/cpufeature.h> #include <asm/apicdef.h> #include <linux/atomic.h> #include <asm/fixmap.h> #include <asm/mpspec.h> #include <asm/msr.h> #include <asm/hardirq.h> #define ARCH_APICTIMER_STOPS_ON_C3 1 /* * Debugging macros */ #define APIC_QUIET 0 #define APIC_VERBOSE 1 #define APIC_DEBUG 2 /* Macros for apic_extnmi which controls external NMI masking */ #define APIC_EXTNMI_BSP 0 /* Default */ #define APIC_EXTNMI_ALL 1 #define APIC_EXTNMI_NONE 2 /* * Define the default level of output to be very little * This can be turned up by using apic=verbose for more * information and apic=debug for _lots_ of information. * apic_verbosity is defined in apic.c */ #define apic_printk(v, s, a...) do { \ if ((v) <= apic_verbosity) \ printk(s, ##a); \ } while (0) #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) extern void generic_apic_probe(void); #else static inline void generic_apic_probe(void) { } #endif #ifdef CONFIG_X86_LOCAL_APIC extern int apic_verbosity; extern int local_apic_timer_c2_ok; extern int disable_apic; extern unsigned int lapic_timer_period; extern enum apic_intr_mode_id apic_intr_mode; enum apic_intr_mode_id { APIC_PIC, APIC_VIRTUAL_WIRE, APIC_VIRTUAL_WIRE_NO_CONFIG, APIC_SYMMETRIC_IO, APIC_SYMMETRIC_IO_NO_ROUTING }; #ifdef CONFIG_SMP extern void __inquire_remote_apic(int apicid); #else /* CONFIG_SMP */ static inline void __inquire_remote_apic(int apicid) { } #endif /* CONFIG_SMP */ static inline void default_inquire_remote_apic(int apicid) { if (apic_verbosity >= APIC_DEBUG) __inquire_remote_apic(apicid); } /* * With 82489DX we can't rely on apic feature bit * retrieved via cpuid but still have to deal with * such an apic chip so we assume that SMP configuration * is found from MP table (64bit case uses ACPI mostly * which set smp presence flag as well so we are safe * to use this helper too). */ static inline bool apic_from_smp_config(void) { return smp_found_config && !disable_apic; } /* * Basic functions accessing APICs. */ #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #endif extern int setup_profiling_timer(unsigned int); static inline void native_apic_mem_write(u32 reg, u32 v) { volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP, ASM_OUTPUT2("=r" (v), "=m" (*addr)), ASM_OUTPUT2("0" (v), "m" (*addr))); } static inline u32 native_apic_mem_read(u32 reg) { return *((volatile u32 *)(APIC_BASE + reg)); } extern void native_apic_wait_icr_idle(void); extern u32 native_safe_apic_wait_icr_idle(void); extern void native_apic_icr_write(u32 low, u32 id); extern u64 native_apic_icr_read(void); static inline bool apic_is_x2apic_enabled(void) { u64 msr; if (rdmsrl_safe(MSR_IA32_APICBASE, &msr)) return false; return msr & X2APIC_ENABLE; } extern void enable_IR_x2apic(void); extern int get_physical_broadcast(void); extern int lapic_get_maxlvt(void); extern void clear_local_APIC(void); extern void disconnect_bsp_APIC(int virt_wire_setup); extern void disable_local_APIC(void); extern void apic_soft_disable(void); extern void lapic_shutdown(void); extern void sync_Arb_IDs(void); extern void init_bsp_APIC(void); extern void apic_intr_mode_select(void); extern void apic_intr_mode_init(void); extern void init_apic_mappings(void); void register_lapic_address(unsigned long address); extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); extern void lapic_update_tsc_freq(void); #ifdef CONFIG_X86_64 static inline int apic_force_enable(unsigned long addr) { return -1; } #else extern int apic_force_enable(unsigned long addr); #endif extern void apic_ap_setup(void); /* * On 32bit this is mach-xxx local */ #ifdef CONFIG_X86_64 extern int apic_is_clustered_box(void); #else static inline int apic_is_clustered_box(void) { return 0; } #endif extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask); extern void lapic_assign_system_vectors(void); extern void lapic_assign_legacy_vector(unsigned int isairq, bool replace); extern void lapic_update_legacy_vectors(void); extern void lapic_online(void); extern void lapic_offline(void); extern bool apic_needs_pit(void); extern void apic_send_IPI_allbutself(unsigned int vector); #else /* !CONFIG_X86_LOCAL_APIC */ static inline void lapic_shutdown(void) { } #define local_apic_timer_c2_ok 1 static inline void init_apic_mappings(void) { } static inline void disable_local_APIC(void) { } # define setup_boot_APIC_clock x86_init_noop # define setup_secondary_APIC_clock x86_init_noop static inline void lapic_update_tsc_freq(void) { } static inline void init_bsp_APIC(void) { } static inline void apic_intr_mode_select(void) { } static inline void apic_intr_mode_init(void) { } static inline void lapic_assign_system_vectors(void) { } static inline void lapic_assign_legacy_vector(unsigned int i, bool r) { } static inline bool apic_needs_pit(void) { return true; } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_X2APIC static inline void native_apic_msr_write(u32 reg, u32 v) { if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR || reg == APIC_LVR) return; wrmsr(APIC_BASE_MSR + (reg >> 4), v, 0); } static inline void native_apic_msr_eoi_write(u32 reg, u32 v) { __wrmsr(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK, 0); } static inline u32 native_apic_msr_read(u32 reg) { u64 msr; if (reg == APIC_DFR) return -1; rdmsrl(APIC_BASE_MSR + (reg >> 4), msr); return (u32)msr; } static inline void native_x2apic_wait_icr_idle(void) { /* no need to wait for icr idle in x2apic */ return; } static inline u32 native_safe_x2apic_wait_icr_idle(void) { /* no need to wait for icr idle in x2apic */ return 0; } static inline void native_x2apic_icr_write(u32 low, u32 id) { wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); } static inline u64 native_x2apic_icr_read(void) { unsigned long val; rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); return val; } extern int x2apic_mode; extern int x2apic_phys; extern void __init x2apic_set_max_apicid(u32 apicid); extern void __init check_x2apic(void); extern void x2apic_setup(void); static inline int x2apic_enabled(void) { return boot_cpu_has(X86_FEATURE_X2APIC) && apic_is_x2apic_enabled(); } #define x2apic_supported() (boot_cpu_has(X86_FEATURE_X2APIC)) #else /* !CONFIG_X86_X2APIC */ static inline void check_x2apic(void) { } static inline void x2apic_setup(void) { } static inline int x2apic_enabled(void) { return 0; } #define x2apic_mode (0) #define x2apic_supported() (0) #endif /* !CONFIG_X86_X2APIC */ struct irq_data; /* * Copyright 2004 James Cleverdon, IBM. * * Generic APIC sub-arch data struct. * * Hacked for x86-64 by James Cleverdon from i386 architecture code by * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. */ struct apic { /* Hotpath functions first */ void (*eoi_write)(u32 reg, u32 v); void (*native_eoi_write)(u32 reg, u32 v); void (*write)(u32 reg, u32 v); u32 (*read)(u32 reg); /* IPI related functions */ void (*wait_icr_idle)(void); u32 (*safe_wait_icr_idle)(void); void (*send_IPI)(int cpu, int vector); void (*send_IPI_mask)(const struct cpumask *mask, int vector); void (*send_IPI_mask_allbutself)(const struct cpumask *msk, int vec); void (*send_IPI_allbutself)(int vector); void (*send_IPI_all)(int vector); void (*send_IPI_self)(int vector); /* dest_logical is used by the IPI functions */ u32 dest_logical; u32 disable_esr; u32 irq_delivery_mode; u32 irq_dest_mode; u32 (*calc_dest_apicid)(unsigned int cpu); /* ICR related functions */ u64 (*icr_read)(void); void (*icr_write)(u32 low, u32 high); /* Probe, setup and smpboot functions */ int (*probe)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); int (*apic_id_valid)(u32 apicid); int (*apic_id_registered)(void); bool (*check_apicid_used)(physid_mask_t *map, int apicid); void (*init_apic_ldr)(void); void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); void (*setup_apic_routing)(void); int (*cpu_present_to_apicid)(int mps_cpu); void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap); int (*check_phys_apicid_present)(int phys_apicid); int (*phys_pkg_id)(int cpuid_apic, int index_msb); u32 (*get_apic_id)(unsigned long x); u32 (*set_apic_id)(unsigned int id); /* wakeup_secondary_cpu */ int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); void (*inquire_remote_apic)(int apicid); #ifdef CONFIG_X86_32 /* * Called very early during boot from get_smp_config(). It should * return the logical apicid. x86_[bios]_cpu_to_apicid is * initialized before this function is called. * * If logical apicid can't be determined that early, the function * may return BAD_APICID. Logical apicid will be configured after * init_apic_ldr() while bringing up CPUs. Note that NUMA affinity * won't be applied properly during early boot in this case. */ int (*x86_32_early_logical_apicid)(int cpu); #endif char *name; }; /* * Pointer to the local APIC driver in use on this system (there's * always just one such driver in use - the kernel decides via an * early probing process which one it picks - and then sticks to it): */ extern struct apic *apic; /* * APIC drivers are probed based on how they are listed in the .apicdrivers * section. So the order is important and enforced by the ordering * of different apic driver files in the Makefile. * * For the files having two apic drivers, we use apic_drivers() * to enforce the order with in them. */ #define apic_driver(sym) \ static const struct apic *__apicdrivers_##sym __used \ __aligned(sizeof(struct apic *)) \ __section(".apicdrivers") = { &sym } #define apic_drivers(sym1, sym2) \ static struct apic *__apicdrivers_##sym1##sym2[2] __used \ __aligned(sizeof(struct apic *)) \ __section(".apicdrivers") = { &sym1, &sym2 } extern struct apic *__apicdrivers[], *__apicdrivers_end[]; /* * APIC functionality to boot other CPUs - only used on SMP: */ #ifdef CONFIG_SMP extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip); extern int lapic_can_unplug_cpu(void); #endif #ifdef CONFIG_X86_LOCAL_APIC static inline u32 apic_read(u32 reg) { return apic->read(reg); } static inline void apic_write(u32 reg, u32 val) { apic->write(reg, val); } static inline void apic_eoi(void) { apic->eoi_write(APIC_EOI, APIC_EOI_ACK); } static inline u64 apic_icr_read(void) { return apic->icr_read(); } static inline void apic_icr_write(u32 low, u32 high) { apic->icr_write(low, high); } static inline void apic_wait_icr_idle(void) { apic->wait_icr_idle(); } static inline u32 safe_apic_wait_icr_idle(void) { return apic->safe_wait_icr_idle(); } extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)); #else /* CONFIG_X86_LOCAL_APIC */ static inline u32 apic_read(u32 reg) { return 0; } static inline void apic_write(u32 reg, u32 val) { } static inline void apic_eoi(void) { } static inline u64 apic_icr_read(void) { return 0; } static inline void apic_icr_write(u32 low, u32 high) { } static inline void apic_wait_icr_idle(void) { } static inline u32 safe_apic_wait_icr_idle(void) { return 0; } static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {} #endif /* CONFIG_X86_LOCAL_APIC */ extern void apic_ack_irq(struct irq_data *data); static inline void ack_APIC_irq(void) { /* * ack_APIC_irq() actually gets compiled as a single instruction * ... yummie. */ apic_eoi(); } static inline bool lapic_vector_set_in_irr(unsigned int vector) { u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); return !!(irr & (1U << (vector % 32))); } static inline unsigned default_get_apic_id(unsigned long x) { unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID)) return (x >> 24) & 0xFF; else return (x >> 24) & 0x0F; } /* * Warm reset vector position: */ #define TRAMPOLINE_PHYS_LOW 0x467 #define TRAMPOLINE_PHYS_HIGH 0x469 extern void generic_bigsmp_probe(void); #ifdef CONFIG_X86_LOCAL_APIC #include <asm/smp.h> #define APIC_DFR_VALUE (APIC_DFR_FLAT) DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid); extern struct apic apic_noop; static inline unsigned int read_apic_id(void) { unsigned int reg = apic_read(APIC_ID); return apic->get_apic_id(reg); } extern int default_apic_id_valid(u32 apicid); extern int default_acpi_madt_oem_check(char *, char *); extern void default_setup_apic_routing(void); extern u32 apic_default_calc_apicid(unsigned int cpu); extern u32 apic_flat_calc_apicid(unsigned int cpu); extern bool default_check_apicid_used(physid_mask_t *map, int apicid); extern void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap); extern int default_cpu_present_to_apicid(int mps_cpu); extern int default_check_phys_apicid_present(int phys_apicid); #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_SMP bool apic_id_is_primary_thread(unsigned int id); void apic_smt_update(void); #else static inline bool apic_id_is_primary_thread(unsigned int id) { return false; } static inline void apic_smt_update(void) { } #endif struct msi_msg; #ifdef CONFIG_PCI_MSI void x86_vector_msi_compose_msg(struct irq_data *data, struct msi_msg *msg); #else # define x86_vector_msi_compose_msg NULL #endif extern void ioapic_zap_locks(void); #endif /* _ASM_X86_APIC_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_FLOW_DISSECTOR_H #define _NET_FLOW_DISSECTOR_H #include <linux/types.h> #include <linux/in6.h> #include <linux/siphash.h> #include <linux/string.h> #include <uapi/linux/if_ether.h> struct bpf_prog; struct net; struct sk_buff; /** * struct flow_dissector_key_control: * @thoff: Transport header offset */ struct flow_dissector_key_control { u16 thoff; u16 addr_type; u32 flags; }; #define FLOW_DIS_IS_FRAGMENT BIT(0) #define FLOW_DIS_FIRST_FRAG BIT(1) #define FLOW_DIS_ENCAPSULATION BIT(2) enum flow_dissect_ret { FLOW_DISSECT_RET_OUT_GOOD, FLOW_DISSECT_RET_OUT_BAD, FLOW_DISSECT_RET_PROTO_AGAIN, FLOW_DISSECT_RET_IPPROTO_AGAIN, FLOW_DISSECT_RET_CONTINUE, }; /** * struct flow_dissector_key_basic: * @n_proto: Network header protocol (eg. IPv4/IPv6) * @ip_proto: Transport header protocol (eg. TCP/UDP) */ struct flow_dissector_key_basic { __be16 n_proto; u8 ip_proto; u8 padding; }; struct flow_dissector_key_tags { u32 flow_label; }; struct flow_dissector_key_vlan { union { struct { u16 vlan_id:12, vlan_dei:1, vlan_priority:3; }; __be16 vlan_tci; }; __be16 vlan_tpid; }; struct flow_dissector_mpls_lse { u32 mpls_ttl:8, mpls_bos:1, mpls_tc:3, mpls_label:20; }; #define FLOW_DIS_MPLS_MAX 7 struct flow_dissector_key_mpls { struct flow_dissector_mpls_lse ls[FLOW_DIS_MPLS_MAX]; /* Label Stack */ u8 used_lses; /* One bit set for each Label Stack Entry in use */ }; static inline void dissector_set_mpls_lse(struct flow_dissector_key_mpls *mpls, int lse_index) { mpls->used_lses |= 1 << lse_index; } #define FLOW_DIS_TUN_OPTS_MAX 255 /** * struct flow_dissector_key_enc_opts: * @data: tunnel option data * @len: length of tunnel option data * @dst_opt_type: tunnel option type */ struct flow_dissector_key_enc_opts { u8 data[FLOW_DIS_TUN_OPTS_MAX]; /* Using IP_TUNNEL_OPTS_MAX is desired * here but seems difficult to #include */ u8 len; __be16 dst_opt_type; }; struct flow_dissector_key_keyid { __be32 keyid; }; /** * struct flow_dissector_key_ipv4_addrs: * @src: source ip address * @dst: destination ip address */ struct flow_dissector_key_ipv4_addrs { /* (src,dst) must be grouped, in the same way than in IP header */ __be32 src; __be32 dst; }; /** * struct flow_dissector_key_ipv6_addrs: * @src: source ip address * @dst: destination ip address */ struct flow_dissector_key_ipv6_addrs { /* (src,dst) must be grouped, in the same way than in IP header */ struct in6_addr src; struct in6_addr dst; }; /** * struct flow_dissector_key_tipc: * @key: source node address combined with selector */ struct flow_dissector_key_tipc { __be32 key; }; /** * struct flow_dissector_key_addrs: * @v4addrs: IPv4 addresses * @v6addrs: IPv6 addresses */ struct flow_dissector_key_addrs { union { struct flow_dissector_key_ipv4_addrs v4addrs; struct flow_dissector_key_ipv6_addrs v6addrs; struct flow_dissector_key_tipc tipckey; }; }; /** * flow_dissector_key_arp: * @ports: Operation, source and target addresses for an ARP header * for Ethernet hardware addresses and IPv4 protocol addresses * sip: Sender IP address * tip: Target IP address * op: Operation * sha: Sender hardware address * tpa: Target hardware address */ struct flow_dissector_key_arp { __u32 sip; __u32 tip; __u8 op; unsigned char sha[ETH_ALEN]; unsigned char tha[ETH_ALEN]; }; /** * flow_dissector_key_tp_ports: * @ports: port numbers of Transport header * src: source port number * dst: destination port number */ struct flow_dissector_key_ports { union { __be32 ports; struct { __be16 src; __be16 dst; }; }; }; /** * flow_dissector_key_icmp: * type: ICMP type * code: ICMP code * id: session identifier */ struct flow_dissector_key_icmp { struct { u8 type; u8 code; }; u16 id; }; /** * struct flow_dissector_key_eth_addrs: * @src: source Ethernet address * @dst: destination Ethernet address */ struct flow_dissector_key_eth_addrs { /* (dst,src) must be grouped, in the same way than in ETH header */ unsigned char dst[ETH_ALEN]; unsigned char src[ETH_ALEN]; }; /** * struct flow_dissector_key_tcp: * @flags: flags */ struct flow_dissector_key_tcp { __be16 flags; }; /** * struct flow_dissector_key_ip: * @tos: tos * @ttl: ttl */ struct flow_dissector_key_ip { __u8 tos; __u8 ttl; }; /** * struct flow_dissector_key_meta: * @ingress_ifindex: ingress ifindex * @ingress_iftype: ingress interface type */ struct flow_dissector_key_meta { int ingress_ifindex; u16 ingress_iftype; }; /** * struct flow_dissector_key_ct: * @ct_state: conntrack state after converting with map * @ct_mark: conttrack mark * @ct_zone: conntrack zone * @ct_labels: conntrack labels */ struct flow_dissector_key_ct { u16 ct_state; u16 ct_zone; u32 ct_mark; u32 ct_labels[4]; }; /** * struct flow_dissector_key_hash: * @hash: hash value */ struct flow_dissector_key_hash { u32 hash; }; enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */ FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */ FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */ FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_PORTS_RANGE, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_ICMP, /* struct flow_dissector_key_icmp */ FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */ FLOW_DISSECTOR_KEY_TIPC, /* struct flow_dissector_key_tipc */ FLOW_DISSECTOR_KEY_ARP, /* struct flow_dissector_key_arp */ FLOW_DISSECTOR_KEY_VLAN, /* struct flow_dissector_key_vlan */ FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_tags */ FLOW_DISSECTOR_KEY_GRE_KEYID, /* struct flow_dissector_key_keyid */ FLOW_DISSECTOR_KEY_MPLS_ENTROPY, /* struct flow_dissector_key_keyid */ FLOW_DISSECTOR_KEY_ENC_KEYID, /* struct flow_dissector_key_keyid */ FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */ FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ FLOW_DISSECTOR_KEY_ENC_CONTROL, /* struct flow_dissector_key_control */ FLOW_DISSECTOR_KEY_ENC_PORTS, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_MPLS, /* struct flow_dissector_key_mpls */ FLOW_DISSECTOR_KEY_TCP, /* struct flow_dissector_key_tcp */ FLOW_DISSECTOR_KEY_IP, /* struct flow_dissector_key_ip */ FLOW_DISSECTOR_KEY_CVLAN, /* struct flow_dissector_key_vlan */ FLOW_DISSECTOR_KEY_ENC_IP, /* struct flow_dissector_key_ip */ FLOW_DISSECTOR_KEY_ENC_OPTS, /* struct flow_dissector_key_enc_opts */ FLOW_DISSECTOR_KEY_META, /* struct flow_dissector_key_meta */ FLOW_DISSECTOR_KEY_CT, /* struct flow_dissector_key_ct */ FLOW_DISSECTOR_KEY_HASH, /* struct flow_dissector_key_hash */ FLOW_DISSECTOR_KEY_MAX, }; #define FLOW_DISSECTOR_F_PARSE_1ST_FRAG BIT(0) #define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL BIT(1) #define FLOW_DISSECTOR_F_STOP_AT_ENCAP BIT(2) struct flow_dissector_key { enum flow_dissector_key_id key_id; size_t offset; /* offset of struct flow_dissector_key_* in target the struct */ }; struct flow_dissector { unsigned int used_keys; /* each bit repesents presence of one key id */ unsigned short int offset[FLOW_DISSECTOR_KEY_MAX]; }; struct flow_keys_basic { struct flow_dissector_key_control control; struct flow_dissector_key_basic basic; }; struct flow_keys { struct flow_dissector_key_control control; #define FLOW_KEYS_HASH_START_FIELD basic struct flow_dissector_key_basic basic __aligned(SIPHASH_ALIGNMENT); struct flow_dissector_key_tags tags; struct flow_dissector_key_vlan vlan; struct flow_dissector_key_vlan cvlan; struct flow_dissector_key_keyid keyid; struct flow_dissector_key_ports ports; struct flow_dissector_key_icmp icmp; /* 'addrs' must be the last member */ struct flow_dissector_key_addrs addrs; }; #define FLOW_KEYS_HASH_OFFSET \ offsetof(struct flow_keys, FLOW_KEYS_HASH_START_FIELD) __be32 flow_get_u32_src(const struct flow_keys *flow); __be32 flow_get_u32_dst(const struct flow_keys *flow); extern struct flow_dissector flow_keys_dissector; extern struct flow_dissector flow_keys_basic_dissector; /* struct flow_keys_digest: * * This structure is used to hold a digest of the full flow keys. This is a * larger "hash" of a flow to allow definitively matching specific flows where * the 32 bit skb->hash is not large enough. The size is limited to 16 bytes so * that it can be used in CB of skb (see sch_choke for an example). */ #define FLOW_KEYS_DIGEST_LEN 16 struct flow_keys_digest { u8 data[FLOW_KEYS_DIGEST_LEN]; }; void make_flow_keys_digest(struct flow_keys_digest *digest, const struct flow_keys *flow); static inline bool flow_keys_have_l4(const struct flow_keys *keys) { return (keys->ports.ports || keys->tags.flow_label); } u32 flow_hash_from_keys(struct flow_keys *keys); void skb_flow_get_icmp_tci(const struct sk_buff *skb, struct flow_dissector_key_icmp *key_icmp, void *data, int thoff, int hlen); static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { return flow_dissector->used_keys & (1 << key_id); } static inline void *skb_flow_dissector_target(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id, void *target_container) { return ((char *)target_container) + flow_dissector->offset[key_id]; } struct bpf_flow_dissector { struct bpf_flow_keys *flow_keys; const struct sk_buff *skb; void *data; void *data_end; }; static inline void flow_dissector_init_keys(struct flow_dissector_key_control *key_control, struct flow_dissector_key_basic *key_basic) { memset(key_control, 0, sizeof(*key_control)); memset(key_basic, 0, sizeof(*key_basic)); } #ifdef CONFIG_BPF_SYSCALL int flow_dissector_bpf_prog_attach_check(struct net *net, struct bpf_prog *prog); #endif /* CONFIG_BPF_SYSCALL */ #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PGALLOC_H #define _ASM_X86_PGALLOC_H #include <linux/threads.h> #include <linux/mm.h> /* for struct page */ #include <linux/pagemap.h> #define __HAVE_ARCH_PTE_ALLOC_ONE #define __HAVE_ARCH_PGD_FREE #include <asm-generic/pgalloc.h> static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; } #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else #define paravirt_pgd_alloc(mm) __paravirt_pgd_alloc(mm) static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {} static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {} static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {} static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count) {} static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {} static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) {} static inline void paravirt_release_pte(unsigned long pfn) {} static inline void paravirt_release_pmd(unsigned long pfn) {} static inline void paravirt_release_pud(unsigned long pfn) {} static inline void paravirt_release_p4d(unsigned long pfn) {} #endif /* * Flags to use when allocating a user page table page. */ extern gfp_t __userpte_alloc_gfp; #ifdef CONFIG_PAGE_TABLE_ISOLATION /* * Instead of one PGD, we acquire two PGDs. Being order-1, it is * both 8k in size and 8k-aligned. That lets us just flip bit 12 * in a pointer to swap between the two 4k halves. */ #define PGD_ALLOCATION_ORDER 1 #else #define PGD_ALLOCATION_ORDER 0 #endif /* * Allocate and free page tables. */ extern pgd_t *pgd_alloc(struct mm_struct *); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); extern pgtable_t pte_alloc_one(struct mm_struct *); extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte); static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte, unsigned long address) { ___pte_free_tlb(tlb, pte); } static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); } static inline void pmd_populate_kernel_safe(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT); set_pmd_safe(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); } static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) { unsigned long pfn = page_to_pfn(pte); paravirt_alloc_pte(mm, pfn); set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE)); } #define pmd_pgtable(pmd) pmd_page(pmd) #if CONFIG_PGTABLE_LEVELS > 2 extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, unsigned long address) { ___pmd_free_tlb(tlb, pmd); } #ifdef CONFIG_X86_PAE extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); #else /* !CONFIG_X86_PAE */ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) { paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd))); } static inline void pud_populate_safe(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) { paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); set_pud_safe(pud, __pud(_PAGE_TABLE | __pa(pmd))); } #endif /* CONFIG_X86_PAE */ #if CONFIG_PGTABLE_LEVELS > 3 static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) { paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud))); } static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) { paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); set_p4d_safe(p4d, __p4d(_PAGE_TABLE | __pa(pud))); } extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, unsigned long address) { ___pud_free_tlb(tlb, pud); } #if CONFIG_PGTABLE_LEVELS > 4 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d) { if (!pgtable_l5_enabled()) return; paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT); set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d))); } static inline void pgd_populate_safe(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d) { if (!pgtable_l5_enabled()) return; paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT); set_pgd_safe(pgd, __pgd(_PAGE_TABLE | __pa(p4d))); } static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr) { gfp_t gfp = GFP_KERNEL_ACCOUNT; if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; return (p4d_t *)get_zeroed_page(gfp); } static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) { if (!pgtable_l5_enabled()) return; BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); free_page((unsigned long)p4d); } extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d); static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, unsigned long address) { if (pgtable_l5_enabled()) ___p4d_free_tlb(tlb, p4d); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #endif /* _ASM_X86_PGALLOC_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 #ifndef __LINUX_MROUTE_BASE_H #define __LINUX_MROUTE_BASE_H #include <linux/netdevice.h> #include <linux/rhashtable-types.h> #include <linux/spinlock.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/fib_notifier.h> #include <net/ip_fib.h> /** * struct vif_device - interface representor for multicast routing * @dev: network device being used * @bytes_in: statistic; bytes ingressing * @bytes_out: statistic; bytes egresing * @pkt_in: statistic; packets ingressing * @pkt_out: statistic; packets egressing * @rate_limit: Traffic shaping (NI) * @threshold: TTL threshold * @flags: Control flags * @link: Physical interface index * @dev_parent_id: device parent id * @local: Local address * @remote: Remote address for tunnels */ struct vif_device { struct net_device *dev; unsigned long bytes_in, bytes_out; unsigned long pkt_in, pkt_out; unsigned long rate_limit; unsigned char threshold; unsigned short flags; int link; /* Currently only used by ipmr */ struct netdev_phys_item_id dev_parent_id; __be32 local, remote; }; struct vif_entry_notifier_info { struct fib_notifier_info info; struct net_device *dev; unsigned short vif_index; unsigned short vif_flags; u32 tb_id; }; static inline int mr_call_vif_notifier(struct notifier_block *nb, unsigned short family, enum fib_event_type event_type, struct vif_device *vif, unsigned short vif_index, u32 tb_id, struct netlink_ext_ack *extack) { struct vif_entry_notifier_info info = { .info = { .family = family, .extack = extack, }, .dev = vif->dev, .vif_index = vif_index, .vif_flags = vif->flags, .tb_id = tb_id, }; return call_fib_notifier(nb, event_type, &info.info); } static inline int mr_call_vif_notifiers(struct net *net, unsigned short family, enum fib_event_type event_type, struct vif_device *vif, unsigned short vif_index, u32 tb_id, unsigned int *ipmr_seq) { struct vif_entry_notifier_info info = { .info = { .family = family, }, .dev = vif->dev, .vif_index = vif_index, .vif_flags = vif->flags, .tb_id = tb_id, }; ASSERT_RTNL(); (*ipmr_seq)++; return call_fib_notifiers(net, event_type, &info.info); } #ifndef MAXVIFS /* This one is nasty; value is defined in uapi using different symbols for * mroute and morute6 but both map into same 32. */ #define MAXVIFS 32 #endif #define VIF_EXISTS(_mrt, _idx) (!!((_mrt)->vif_table[_idx].dev)) /* mfc_flags: * MFC_STATIC - the entry was added statically (not by a routing daemon) * MFC_OFFLOAD - the entry was offloaded to the hardware */ enum { MFC_STATIC = BIT(0), MFC_OFFLOAD = BIT(1), }; /** * struct mr_mfc - common multicast routing entries * @mnode: rhashtable list * @mfc_parent: source interface (iif) * @mfc_flags: entry flags * @expires: unresolved entry expire time * @unresolved: unresolved cached skbs * @last_assert: time of last assert * @minvif: minimum VIF id * @maxvif: maximum VIF id * @bytes: bytes that have passed for this entry * @pkt: packets that have passed for this entry * @wrong_if: number of wrong source interface hits * @lastuse: time of last use of the group (traffic or update) * @ttls: OIF TTL threshold array * @refcount: reference count for this entry * @list: global entry list * @rcu: used for entry destruction * @free: Operation used for freeing an entry under RCU */ struct mr_mfc { struct rhlist_head mnode; unsigned short mfc_parent; int mfc_flags; union { struct { unsigned long expires; struct sk_buff_head unresolved; } unres; struct { unsigned long last_assert; int minvif; int maxvif; unsigned long bytes; unsigned long pkt; unsigned long wrong_if; unsigned long lastuse; unsigned char ttls[MAXVIFS]; refcount_t refcount; } res; } mfc_un; struct list_head list; struct rcu_head rcu; void (*free)(struct rcu_head *head); }; static inline void mr_cache_put(struct mr_mfc *c) { if (refcount_dec_and_test(&c->mfc_un.res.refcount)) call_rcu(&c->rcu, c->free); } static inline void mr_cache_hold(struct mr_mfc *c) { refcount_inc(&c->mfc_un.res.refcount); } struct mfc_entry_notifier_info { struct fib_notifier_info info; struct mr_mfc *mfc; u32 tb_id; }; static inline int mr_call_mfc_notifier(struct notifier_block *nb, unsigned short family, enum fib_event_type event_type, struct mr_mfc *mfc, u32 tb_id, struct netlink_ext_ack *extack) { struct mfc_entry_notifier_info info = { .info = { .family = family, .extack = extack, }, .mfc = mfc, .tb_id = tb_id }; return call_fib_notifier(nb, event_type, &info.info); } static inline int mr_call_mfc_notifiers(struct net *net, unsigned short family, enum fib_event_type event_type, struct mr_mfc *mfc, u32 tb_id, unsigned int *ipmr_seq) { struct mfc_entry_notifier_info info = { .info = { .family = family, }, .mfc = mfc, .tb_id = tb_id }; ASSERT_RTNL(); (*ipmr_seq)++; return call_fib_notifiers(net, event_type, &info.info); } struct mr_table; /** * struct mr_table_ops - callbacks and info for protocol-specific ops * @rht_params: parameters for accessing the MFC hash * @cmparg_any: a hash key to be used for matching on (*,*) routes */ struct mr_table_ops { const struct rhashtable_params *rht_params; void *cmparg_any; }; /** * struct mr_table - a multicast routing table * @list: entry within a list of multicast routing tables * @net: net where this table belongs * @ops: protocol specific operations * @id: identifier of the table * @mroute_sk: socket associated with the table * @ipmr_expire_timer: timer for handling unresolved routes * @mfc_unres_queue: list of unresolved MFC entries * @vif_table: array containing all possible vifs * @mfc_hash: Hash table of all resolved routes for easy lookup * @mfc_cache_list: list of resovled routes for possible traversal * @maxvif: Identifier of highest value vif currently in use * @cache_resolve_queue_len: current size of unresolved queue * @mroute_do_assert: Whether to inform userspace on wrong ingress * @mroute_do_pim: Whether to receive IGMP PIMv1 * @mroute_reg_vif_num: PIM-device vif index */ struct mr_table { struct list_head list; possible_net_t net; struct mr_table_ops ops; u32 id; struct sock __rcu *mroute_sk; struct timer_list ipmr_expire_timer; struct list_head mfc_unres_queue; struct vif_device vif_table[MAXVIFS]; struct rhltable mfc_hash; struct list_head mfc_cache_list; int maxvif; atomic_t cache_resolve_queue_len; bool mroute_do_assert; bool mroute_do_pim; bool mroute_do_wrvifwhole; int mroute_reg_vif_num; }; #ifdef CONFIG_IP_MROUTE_COMMON void vif_device_init(struct vif_device *v, struct net_device *dev, unsigned long rate_limit, unsigned char threshold, unsigned short flags, unsigned short get_iflink_mask); struct mr_table * mr_table_alloc(struct net *net, u32 id, struct mr_table_ops *ops, void (*expire_func)(struct timer_list *t), void (*table_set)(struct mr_table *mrt, struct net *net)); /* These actually return 'struct mr_mfc *', but to avoid need for explicit * castings they simply return void. */ void *mr_mfc_find_parent(struct mr_table *mrt, void *hasharg, int parent); void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi); void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg); int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mr_mfc *c, struct rtmsg *rtm); int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb, struct netlink_callback *cb, int (*fill)(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags), spinlock_t *lock, struct fib_dump_filter *filter); int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, struct mr_table *(*iter)(struct net *net, struct mr_table *mrt), int (*fill)(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags), spinlock_t *lock, struct fib_dump_filter *filter); int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, int (*rules_dump)(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), rwlock_t *mrt_lock, struct netlink_ext_ack *extack); #else static inline void vif_device_init(struct vif_device *v, struct net_device *dev, unsigned long rate_limit, unsigned char threshold, unsigned short flags, unsigned short get_iflink_mask) { } static inline void *mr_mfc_find_parent(struct mr_table *mrt, void *hasharg, int parent) { return NULL; } static inline void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi) { return NULL; } static inline struct mr_mfc *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg) { return NULL; } static inline int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mr_mfc *c, struct rtmsg *rtm) { return -EINVAL; } static inline int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, struct mr_table *(*iter)(struct net *net, struct mr_table *mrt), int (*fill)(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags), spinlock_t *lock, struct fib_dump_filter *filter) { return -EINVAL; } static inline int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, int (*rules_dump)(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), rwlock_t *mrt_lock, struct netlink_ext_ack *extack) { return -EINVAL; } #endif static inline void *mr_mfc_find(struct mr_table *mrt, void *hasharg) { return mr_mfc_find_parent(mrt, hasharg, -1); } #ifdef CONFIG_PROC_FS struct mr_vif_iter { struct seq_net_private p; struct mr_table *mrt; int ct; }; struct mr_mfc_iter { struct seq_net_private p; struct mr_table *mrt; struct list_head *cache; /* Lock protecting the mr_table's unresolved queue */ spinlock_t *lock; }; #ifdef CONFIG_IP_MROUTE_COMMON void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos); void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos); static inline void *mr_vif_seq_start(struct seq_file *seq, loff_t *pos) { return *pos ? mr_vif_seq_idx(seq_file_net(seq), seq->private, *pos - 1) : SEQ_START_TOKEN; } /* These actually return 'struct mr_mfc *', but to avoid need for explicit * castings they simply return void. */ void *mr_mfc_seq_idx(struct net *net, struct mr_mfc_iter *it, loff_t pos); void *mr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos); static inline void *mr_mfc_seq_start(struct seq_file *seq, loff_t *pos, struct mr_table *mrt, spinlock_t *lock) { struct mr_mfc_iter *it = seq->private; it->mrt = mrt; it->cache = NULL; it->lock = lock; return *pos ? mr_mfc_seq_idx(seq_file_net(seq), seq->private, *pos - 1) : SEQ_START_TOKEN; } static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v) { struct mr_mfc_iter *it = seq->private; struct mr_table *mrt = it->mrt; if (it->cache == &mrt->mfc_unres_queue) spin_unlock_bh(it->lock); else if (it->cache == &mrt->mfc_cache_list) rcu_read_unlock(); } #else static inline void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos) { return NULL; } static inline void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return NULL; } static inline void *mr_vif_seq_start(struct seq_file *seq, loff_t *pos) { return NULL; } static inline void *mr_mfc_seq_idx(struct net *net, struct mr_mfc_iter *it, loff_t pos) { return NULL; } static inline void *mr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return NULL; } static inline void *mr_mfc_seq_start(struct seq_file *seq, loff_t *pos, struct mr_table *mrt, spinlock_t *lock) { return NULL; } static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v) { } #endif #endif #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Definitions for the 'struct skb_array' datastructure. * * Author: * Michael S. Tsirkin <mst@redhat.com> * * Copyright (C) 2016 Red Hat, Inc. * * Limited-size FIFO of skbs. Can be used more or less whenever * sk_buff_head can be used, except you need to know the queue size in * advance. * Implemented as a type-safe wrapper around ptr_ring. */ #ifndef _LINUX_SKB_ARRAY_H #define _LINUX_SKB_ARRAY_H 1 #ifdef __KERNEL__ #include <linux/ptr_ring.h> #include <linux/skbuff.h> #include <linux/if_vlan.h> #endif struct skb_array { struct ptr_ring ring; }; /* Might be slightly faster than skb_array_full below, but callers invoking * this in a loop must use a compiler barrier, for example cpu_relax(). */ static inline bool __skb_array_full(struct skb_array *a) { return __ptr_ring_full(&a->ring); } static inline bool skb_array_full(struct skb_array *a) { return ptr_ring_full(&a->ring); } static inline int skb_array_produce(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce(&a->ring, skb); } static inline int skb_array_produce_irq(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce_irq(&a->ring, skb); } static inline int skb_array_produce_bh(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce_bh(&a->ring, skb); } static inline int skb_array_produce_any(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce_any(&a->ring, skb); } /* Might be slightly faster than skb_array_empty below, but only safe if the * array is never resized. Also, callers invoking this in a loop must take care * to use a compiler barrier, for example cpu_relax(). */ static inline bool __skb_array_empty(struct skb_array *a) { return __ptr_ring_empty(&a->ring); } static inline struct sk_buff *__skb_array_peek(struct skb_array *a) { return __ptr_ring_peek(&a->ring); } static inline bool skb_array_empty(struct skb_array *a) { return ptr_ring_empty(&a->ring); } static inline bool skb_array_empty_bh(struct skb_array *a) { return ptr_ring_empty_bh(&a->ring); } static inline bool skb_array_empty_irq(struct skb_array *a) { return ptr_ring_empty_irq(&a->ring); } static inline bool skb_array_empty_any(struct skb_array *a) { return ptr_ring_empty_any(&a->ring); } static inline struct sk_buff *__skb_array_consume(struct skb_array *a) { return __ptr_ring_consume(&a->ring); } static inline struct sk_buff *skb_array_consume(struct skb_array *a) { return ptr_ring_consume(&a->ring); } static inline int skb_array_consume_batched(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched(&a->ring, (void **)array, n); } static inline struct sk_buff *skb_array_consume_irq(struct skb_array *a) { return ptr_ring_consume_irq(&a->ring); } static inline int skb_array_consume_batched_irq(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched_irq(&a->ring, (void **)array, n); } static inline struct sk_buff *skb_array_consume_any(struct skb_array *a) { return ptr_ring_consume_any(&a->ring); } static inline int skb_array_consume_batched_any(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched_any(&a->ring, (void **)array, n); } static inline struct sk_buff *skb_array_consume_bh(struct skb_array *a) { return ptr_ring_consume_bh(&a->ring); } static inline int skb_array_consume_batched_bh(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched_bh(&a->ring, (void **)array, n); } static inline int __skb_array_len_with_tag(struct sk_buff *skb) { if (likely(skb)) { int len = skb->len; if (skb_vlan_tag_present(skb)) len += VLAN_HLEN; return len; } else { return 0; } } static inline int skb_array_peek_len(struct skb_array *a) { return PTR_RING_PEEK_CALL(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_peek_len_irq(struct skb_array *a) { return PTR_RING_PEEK_CALL_IRQ(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_peek_len_bh(struct skb_array *a) { return PTR_RING_PEEK_CALL_BH(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_peek_len_any(struct skb_array *a) { return PTR_RING_PEEK_CALL_ANY(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_init(struct skb_array *a, int size, gfp_t gfp) { return ptr_ring_init(&a->ring, size, gfp); } static void __skb_array_destroy_skb(void *ptr) { kfree_skb(ptr); } static inline void skb_array_unconsume(struct skb_array *a, struct sk_buff **skbs, int n) { ptr_ring_unconsume(&a->ring, (void **)skbs, n, __skb_array_destroy_skb); } static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp) { return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb); } static inline int skb_array_resize_multiple(struct skb_array **rings, int nrings, unsigned int size, gfp_t gfp) { BUILD_BUG_ON(offsetof(struct skb_array, ring)); return ptr_ring_resize_multiple((struct ptr_ring **)rings, nrings, size, gfp, __skb_array_destroy_skb); } static inline void skb_array_cleanup(struct skb_array *a) { ptr_ring_cleanup(&a->ring, __skb_array_destroy_skb); } #endif /* _LINUX_SKB_ARRAY_H */
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HUGETLB_INLINE_H #define _LINUX_HUGETLB_INLINE_H #ifdef CONFIG_HUGETLB_PAGE #include <linux/mm.h> static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) { return !!(vma->vm_flags & VM_HUGETLB); } #else static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) { return false; } #endif #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 /* SPDX-License-Identifier: GPL-2.0+ */ /* * Driver for 8250/16550-type serial ports * * Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o. * * Copyright (C) 2001 Russell King. */ #include <linux/serial_8250.h> #include <linux/serial_reg.h> #include <linux/dmaengine.h> #include "../serial_mctrl_gpio.h" struct uart_8250_dma { int (*tx_dma)(struct uart_8250_port *p); int (*rx_dma)(struct uart_8250_port *p); /* Filter function */ dma_filter_fn fn; /* Parameter to the filter function */ void *rx_param; void *tx_param; struct dma_slave_config rxconf; struct dma_slave_config txconf; struct dma_chan *rxchan; struct dma_chan *txchan; /* Device address base for DMA operations */ phys_addr_t rx_dma_addr; phys_addr_t tx_dma_addr; /* DMA address of the buffer in memory */ dma_addr_t rx_addr; dma_addr_t tx_addr; dma_cookie_t rx_cookie; dma_cookie_t tx_cookie; void *rx_buf; size_t rx_size; size_t tx_size; unsigned char tx_running; unsigned char tx_err; unsigned char rx_running; }; struct old_serial_port { unsigned int uart; unsigned int baud_base; unsigned int port; unsigned int irq; upf_t flags; unsigned char io_type; unsigned char __iomem *iomem_base; unsigned short iomem_reg_shift; }; struct serial8250_config { const char *name; unsigned short fifo_size; unsigned short tx_loadsz; unsigned char fcr; unsigned char rxtrig_bytes[UART_FCR_R_TRIG_MAX_STATE]; unsigned int flags; }; #define UART_CAP_FIFO (1 << 8) /* UART has FIFO */ #define UART_CAP_EFR (1 << 9) /* UART has EFR */ #define UART_CAP_SLEEP (1 << 10) /* UART has IER sleep */ #define UART_CAP_AFE (1 << 11) /* MCR-based hw flow control */ #define UART_CAP_UUE (1 << 12) /* UART needs IER bit 6 set (Xscale) */ #define UART_CAP_RTOIE (1 << 13) /* UART needs IER bit 4 set (Xscale, Tegra) */ #define UART_CAP_HFIFO (1 << 14) /* UART has a "hidden" FIFO */ #define UART_CAP_RPM (1 << 15) /* Runtime PM is active while idle */ #define UART_CAP_IRDA (1 << 16) /* UART supports IrDA line discipline */ #define UART_CAP_MINI (1 << 17) /* Mini UART on BCM283X family lacks: * STOP PARITY EPAR SPAR WLEN5 WLEN6 */ #define UART_BUG_QUOT (1 << 0) /* UART has buggy quot LSB */ #define UART_BUG_TXEN (1 << 1) /* UART has buggy TX IIR status */ #define UART_BUG_NOMSR (1 << 2) /* UART has buggy MSR status bits (Au1x00) */ #define UART_BUG_THRE (1 << 3) /* UART has buggy THRE reassertion */ #define UART_BUG_PARITY (1 << 4) /* UART mishandles parity if FIFO enabled */ #define UART_BUG_TXRACE (1 << 5) /* UART Tx fails to set remote DR */ #ifdef CONFIG_SERIAL_8250_SHARE_IRQ #define SERIAL8250_SHARE_IRQS 1 #else #define SERIAL8250_SHARE_IRQS 0 #endif #define SERIAL8250_PORT_FLAGS(_base, _irq, _flags) \ { \ .iobase = _base, \ .irq = _irq, \ .uartclk = 1843200, \ .iotype = UPIO_PORT, \ .flags = UPF_BOOT_AUTOCONF | (_flags), \ } #define SERIAL8250_PORT(_base, _irq) SERIAL8250_PORT_FLAGS(_base, _irq, 0) static inline int serial_in(struct uart_8250_port *up, int offset) { return up->port.serial_in(&up->port, offset); } static inline void serial_out(struct uart_8250_port *up, int offset, int value) { up->port.serial_out(&up->port, offset, value); } void serial8250_clear_and_reinit_fifos(struct uart_8250_port *p); static inline int serial_dl_read(struct uart_8250_port *up) { return up->dl_read(up); } static inline void serial_dl_write(struct uart_8250_port *up, int value) { up->dl_write(up, value); } static inline bool serial8250_set_THRI(struct uart_8250_port *up) { if (up->ier & UART_IER_THRI) return false; up->ier |= UART_IER_THRI; serial_out(up, UART_IER, up->ier); return true; } static inline bool serial8250_clear_THRI(struct uart_8250_port *up) { if (!(up->ier & UART_IER_THRI)) return false; up->ier &= ~UART_IER_THRI; serial_out(up, UART_IER, up->ier); return true; } struct uart_8250_port *serial8250_get_port(int line); void serial8250_rpm_get(struct uart_8250_port *p); void serial8250_rpm_put(struct uart_8250_port *p); void serial8250_rpm_get_tx(struct uart_8250_port *p); void serial8250_rpm_put_tx(struct uart_8250_port *p); int serial8250_em485_config(struct uart_port *port, struct serial_rs485 *rs485); void serial8250_em485_start_tx(struct uart_8250_port *p); void serial8250_em485_stop_tx(struct uart_8250_port *p); void serial8250_em485_destroy(struct uart_8250_port *p); /* MCR <-> TIOCM conversion */ static inline int serial8250_TIOCM_to_MCR(int tiocm) { int mcr = 0; if (tiocm & TIOCM_RTS) mcr |= UART_MCR_RTS; if (tiocm & TIOCM_DTR) mcr |= UART_MCR_DTR; if (tiocm & TIOCM_OUT1) mcr |= UART_MCR_OUT1; if (tiocm & TIOCM_OUT2) mcr |= UART_MCR_OUT2; if (tiocm & TIOCM_LOOP) mcr |= UART_MCR_LOOP; return mcr; } static inline int serial8250_MCR_to_TIOCM(int mcr) { int tiocm = 0; if (mcr & UART_MCR_RTS) tiocm |= TIOCM_RTS; if (mcr & UART_MCR_DTR) tiocm |= TIOCM_DTR; if (mcr & UART_MCR_OUT1) tiocm |= TIOCM_OUT1; if (mcr & UART_MCR_OUT2) tiocm |= TIOCM_OUT2; if (mcr & UART_MCR_LOOP) tiocm |= TIOCM_LOOP; return tiocm; } /* MSR <-> TIOCM conversion */ static inline int serial8250_MSR_to_TIOCM(int msr) { int tiocm = 0; if (msr & UART_MSR_DCD) tiocm |= TIOCM_CAR; if (msr & UART_MSR_RI) tiocm |= TIOCM_RNG; if (msr & UART_MSR_DSR) tiocm |= TIOCM_DSR; if (msr & UART_MSR_CTS) tiocm |= TIOCM_CTS; return tiocm; } static inline void serial8250_out_MCR(struct uart_8250_port *up, int value) { serial_out(up, UART_MCR, value); if (up->gpios) mctrl_gpio_set(up->gpios, serial8250_MCR_to_TIOCM(value)); } static inline int serial8250_in_MCR(struct uart_8250_port *up) { int mctrl; mctrl = serial_in(up, UART_MCR); if (up->gpios) { unsigned int mctrl_gpio = 0; mctrl_gpio = mctrl_gpio_get_outputs(up->gpios, &mctrl_gpio); mctrl |= serial8250_TIOCM_to_MCR(mctrl_gpio); } return mctrl; } #if defined(__alpha__) && !defined(CONFIG_PCI) /* * Digital did something really horribly wrong with the OUT1 and OUT2 * lines on at least some ALPHA's. The failure mode is that if either * is cleared, the machine locks up with endless interrupts. */ #define ALPHA_KLUDGE_MCR (UART_MCR_OUT2 | UART_MCR_OUT1) #else #define ALPHA_KLUDGE_MCR 0 #endif #ifdef CONFIG_SERIAL_8250_PNP int serial8250_pnp_init(void); void serial8250_pnp_exit(void); #else static inline int serial8250_pnp_init(void) { return 0; } static inline void serial8250_pnp_exit(void) { } #endif #ifdef CONFIG_SERIAL_8250_FINTEK int fintek_8250_probe(struct uart_8250_port *uart); #else static inline int fintek_8250_probe(struct uart_8250_port *uart) { return 0; } #endif #ifdef CONFIG_ARCH_OMAP1 static inline int is_omap1_8250(struct uart_8250_port *pt) { int res; switch (pt->port.mapbase) { case OMAP1_UART1_BASE: case OMAP1_UART2_BASE: case OMAP1_UART3_BASE: res = 1; break; default: res = 0; break; } return res; } static inline int is_omap1510_8250(struct uart_8250_port *pt) { if (!cpu_is_omap1510()) return 0; return is_omap1_8250(pt); } #else static inline int is_omap1_8250(struct uart_8250_port *pt) { return 0; } static inline int is_omap1510_8250(struct uart_8250_port *pt) { return 0; } #endif #ifdef CONFIG_SERIAL_8250_DMA extern int serial8250_tx_dma(struct uart_8250_port *); extern int serial8250_rx_dma(struct uart_8250_port *); extern void serial8250_rx_dma_flush(struct uart_8250_port *); extern int serial8250_request_dma(struct uart_8250_port *); extern void serial8250_release_dma(struct uart_8250_port *); #else static inline int serial8250_tx_dma(struct uart_8250_port *p) { return -1; } static inline int serial8250_rx_dma(struct uart_8250_port *p) { return -1; } static inline void serial8250_rx_dma_flush(struct uart_8250_port *p) { } static inline int serial8250_request_dma(struct uart_8250_port *p) { return -1; } static inline void serial8250_release_dma(struct uart_8250_port *p) { } #endif static inline int ns16550a_goto_highspeed(struct uart_8250_port *up) { unsigned char status; status = serial_in(up, 0x04); /* EXCR2 */ #define PRESL(x) ((x) & 0x30) if (PRESL(status) == 0x10) { /* already in high speed mode */ return 0; } else { status &= ~0xB0; /* Disable LOCK, mask out PRESL[01] */ status |= 0x10; /* 1.625 divisor for baud_base --> 921600 */ serial_out(up, 0x04, status); } return 1; } static inline int serial_index(struct uart_port *port) { return port->minor - 64; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 /* SPDX-License-Identifier: GPL-2.0 */ /* * linux/include/linux/sunrpc/addr.h * * Various routines for copying and comparing sockaddrs and for * converting them to and from presentation format. */ #ifndef _LINUX_SUNRPC_ADDR_H #define _LINUX_SUNRPC_ADDR_H #include <linux/socket.h> #include <linux/in.h> #include <linux/in6.h> #include <net/ipv6.h> size_t rpc_ntop(const struct sockaddr *, char *, const size_t); size_t rpc_pton(struct net *, const char *, const size_t, struct sockaddr *, const size_t); char * rpc_sockaddr2uaddr(const struct sockaddr *, gfp_t); size_t rpc_uaddr2sockaddr(struct net *, const char *, const size_t, struct sockaddr *, const size_t); static inline unsigned short rpc_get_port(const struct sockaddr *sap) { switch (sap->sa_family) { case AF_INET: return ntohs(((struct sockaddr_in *)sap)->sin_port); case AF_INET6: return ntohs(((struct sockaddr_in6 *)sap)->sin6_port); } return 0; } static inline void rpc_set_port(struct sockaddr *sap, const unsigned short port) { switch (sap->sa_family) { case AF_INET: ((struct sockaddr_in *)sap)->sin_port = htons(port); break; case AF_INET6: ((struct sockaddr_in6 *)sap)->sin6_port = htons(port); break; } } #define IPV6_SCOPE_DELIMITER '%' #define IPV6_SCOPE_ID_LEN sizeof("%nnnnnnnnnn") static inline bool rpc_cmp_addr4(const struct sockaddr *sap1, const struct sockaddr *sap2) { const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sap1; const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sap2; return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; } static inline bool __rpc_copy_addr4(struct sockaddr *dst, const struct sockaddr *src) { const struct sockaddr_in *ssin = (struct sockaddr_in *) src; struct sockaddr_in *dsin = (struct sockaddr_in *) dst; dsin->sin_family = ssin->sin_family; dsin->sin_addr.s_addr = ssin->sin_addr.s_addr; return true; } #if IS_ENABLED(CONFIG_IPV6) static inline bool rpc_cmp_addr6(const struct sockaddr *sap1, const struct sockaddr *sap2) { const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sap1; const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2; if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr)) return false; else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL) return sin1->sin6_scope_id == sin2->sin6_scope_id; return true; } static inline bool __rpc_copy_addr6(struct sockaddr *dst, const struct sockaddr *src) { const struct sockaddr_in6 *ssin6 = (const struct sockaddr_in6 *) src; struct sockaddr_in6 *dsin6 = (struct sockaddr_in6 *) dst; dsin6->sin6_family = ssin6->sin6_family; dsin6->sin6_addr = ssin6->sin6_addr; dsin6->sin6_scope_id = ssin6->sin6_scope_id; return true; } #else /* !(IS_ENABLED(CONFIG_IPV6) */ static inline bool rpc_cmp_addr6(const struct sockaddr *sap1, const struct sockaddr *sap2) { return false; } static inline bool __rpc_copy_addr6(struct sockaddr *dst, const struct sockaddr *src) { return false; } #endif /* !(IS_ENABLED(CONFIG_IPV6) */ /** * rpc_cmp_addr - compare the address portion of two sockaddrs. * @sap1: first sockaddr * @sap2: second sockaddr * * Just compares the family and address portion. Ignores port, but * compares the scope if it's a link-local address. * * Returns true if the addrs are equal, false if they aren't. */ static inline bool rpc_cmp_addr(const struct sockaddr *sap1, const struct sockaddr *sap2) { if (sap1->sa_family == sap2->sa_family) { switch (sap1->sa_family) { case AF_INET: return rpc_cmp_addr4(sap1, sap2); case AF_INET6: return rpc_cmp_addr6(sap1, sap2); } } return false; } /** * rpc_cmp_addr_port - compare the address and port number of two sockaddrs. * @sap1: first sockaddr * @sap2: second sockaddr */ static inline bool rpc_cmp_addr_port(const struct sockaddr *sap1, const struct sockaddr *sap2) { if (!rpc_cmp_addr(sap1, sap2)) return false; return rpc_get_port(sap1) == rpc_get_port(sap2); } /** * rpc_copy_addr - copy the address portion of one sockaddr to another * @dst: destination sockaddr * @src: source sockaddr * * Just copies the address portion and family. Ignores port, scope, etc. * Caller is responsible for making certain that dst is large enough to hold * the address in src. Returns true if address family is supported. Returns * false otherwise. */ static inline bool rpc_copy_addr(struct sockaddr *dst, const struct sockaddr *src) { switch (src->sa_family) { case AF_INET: return __rpc_copy_addr4(dst, src); case AF_INET6: return __rpc_copy_addr6(dst, src); } return false; } /** * rpc_get_scope_id - return scopeid for a given sockaddr * @sa: sockaddr to get scopeid from * * Returns the value of the sin6_scope_id for AF_INET6 addrs, or 0 if * not an AF_INET6 address. */ static inline u32 rpc_get_scope_id(const struct sockaddr *sa) { if (sa->sa_family != AF_INET6) return 0; return ((struct sockaddr_in6 *) sa)->sin6_scope_id; } #endif /* _LINUX_SUNRPC_ADDR_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM libata #if !defined(_TRACE_LIBATA_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_LIBATA_H #include <linux/ata.h> #include <linux/libata.h> #include <linux/tracepoint.h> #include <linux/trace_seq.h> #define ata_opcode_name(opcode) { opcode, #opcode } #define show_opcode_name(val) \ __print_symbolic(val, \ ata_opcode_name(ATA_CMD_DEV_RESET), \ ata_opcode_name(ATA_CMD_CHK_POWER), \ ata_opcode_name(ATA_CMD_STANDBY), \ ata_opcode_name(ATA_CMD_IDLE), \ ata_opcode_name(ATA_CMD_EDD), \ ata_opcode_name(ATA_CMD_DOWNLOAD_MICRO), \ ata_opcode_name(ATA_CMD_DOWNLOAD_MICRO_DMA), \ ata_opcode_name(ATA_CMD_NOP), \ ata_opcode_name(ATA_CMD_FLUSH), \ ata_opcode_name(ATA_CMD_FLUSH_EXT), \ ata_opcode_name(ATA_CMD_ID_ATA), \ ata_opcode_name(ATA_CMD_ID_ATAPI), \ ata_opcode_name(ATA_CMD_SERVICE), \ ata_opcode_name(ATA_CMD_READ), \ ata_opcode_name(ATA_CMD_READ_EXT), \ ata_opcode_name(ATA_CMD_READ_QUEUED), \ ata_opcode_name(ATA_CMD_READ_STREAM_EXT), \ ata_opcode_name(ATA_CMD_READ_STREAM_DMA_EXT), \ ata_opcode_name(ATA_CMD_WRITE), \ ata_opcode_name(ATA_CMD_WRITE_EXT), \ ata_opcode_name(ATA_CMD_WRITE_QUEUED), \ ata_opcode_name(ATA_CMD_WRITE_STREAM_EXT), \ ata_opcode_name(ATA_CMD_WRITE_STREAM_DMA_EXT), \ ata_opcode_name(ATA_CMD_WRITE_FUA_EXT), \ ata_opcode_name(ATA_CMD_WRITE_QUEUED_FUA_EXT), \ ata_opcode_name(ATA_CMD_FPDMA_READ), \ ata_opcode_name(ATA_CMD_FPDMA_WRITE), \ ata_opcode_name(ATA_CMD_NCQ_NON_DATA), \ ata_opcode_name(ATA_CMD_FPDMA_SEND), \ ata_opcode_name(ATA_CMD_FPDMA_RECV), \ ata_opcode_name(ATA_CMD_PIO_READ), \ ata_opcode_name(ATA_CMD_PIO_READ_EXT), \ ata_opcode_name(ATA_CMD_PIO_WRITE), \ ata_opcode_name(ATA_CMD_PIO_WRITE_EXT), \ ata_opcode_name(ATA_CMD_READ_MULTI), \ ata_opcode_name(ATA_CMD_READ_MULTI_EXT), \ ata_opcode_name(ATA_CMD_WRITE_MULTI), \ ata_opcode_name(ATA_CMD_WRITE_MULTI_EXT), \ ata_opcode_name(ATA_CMD_WRITE_MULTI_FUA_EXT), \ ata_opcode_name(ATA_CMD_SET_FEATURES), \ ata_opcode_name(ATA_CMD_SET_MULTI), \ ata_opcode_name(ATA_CMD_PACKET), \ ata_opcode_name(ATA_CMD_VERIFY), \ ata_opcode_name(ATA_CMD_VERIFY_EXT), \ ata_opcode_name(ATA_CMD_WRITE_UNCORR_EXT), \ ata_opcode_name(ATA_CMD_STANDBYNOW1), \ ata_opcode_name(ATA_CMD_IDLEIMMEDIATE), \ ata_opcode_name(ATA_CMD_SLEEP), \ ata_opcode_name(ATA_CMD_INIT_DEV_PARAMS), \ ata_opcode_name(ATA_CMD_READ_NATIVE_MAX), \ ata_opcode_name(ATA_CMD_READ_NATIVE_MAX_EXT), \ ata_opcode_name(ATA_CMD_SET_MAX), \ ata_opcode_name(ATA_CMD_SET_MAX_EXT), \ ata_opcode_name(ATA_CMD_READ_LOG_EXT), \ ata_opcode_name(ATA_CMD_WRITE_LOG_EXT), \ ata_opcode_name(ATA_CMD_READ_LOG_DMA_EXT), \ ata_opcode_name(ATA_CMD_WRITE_LOG_DMA_EXT), \ ata_opcode_name(ATA_CMD_TRUSTED_NONDATA), \ ata_opcode_name(ATA_CMD_TRUSTED_RCV), \ ata_opcode_name(ATA_CMD_TRUSTED_RCV_DMA), \ ata_opcode_name(ATA_CMD_TRUSTED_SND), \ ata_opcode_name(ATA_CMD_TRUSTED_SND_DMA), \ ata_opcode_name(ATA_CMD_PMP_READ), \ ata_opcode_name(ATA_CMD_PMP_READ_DMA), \ ata_opcode_name(ATA_CMD_PMP_WRITE), \ ata_opcode_name(ATA_CMD_PMP_WRITE_DMA), \ ata_opcode_name(ATA_CMD_CONF_OVERLAY), \ ata_opcode_name(ATA_CMD_SEC_SET_PASS), \ ata_opcode_name(ATA_CMD_SEC_UNLOCK), \ ata_opcode_name(ATA_CMD_SEC_ERASE_PREP), \ ata_opcode_name(ATA_CMD_SEC_ERASE_UNIT), \ ata_opcode_name(ATA_CMD_SEC_FREEZE_LOCK), \ ata_opcode_name(ATA_CMD_SEC_DISABLE_PASS), \ ata_opcode_name(ATA_CMD_CONFIG_STREAM), \ ata_opcode_name(ATA_CMD_SMART), \ ata_opcode_name(ATA_CMD_MEDIA_LOCK), \ ata_opcode_name(ATA_CMD_MEDIA_UNLOCK), \ ata_opcode_name(ATA_CMD_DSM), \ ata_opcode_name(ATA_CMD_CHK_MED_CRD_TYP), \ ata_opcode_name(ATA_CMD_CFA_REQ_EXT_ERR), \ ata_opcode_name(ATA_CMD_CFA_WRITE_NE), \ ata_opcode_name(ATA_CMD_CFA_TRANS_SECT), \ ata_opcode_name(ATA_CMD_CFA_ERASE), \ ata_opcode_name(ATA_CMD_CFA_WRITE_MULT_NE), \ ata_opcode_name(ATA_CMD_REQ_SENSE_DATA), \ ata_opcode_name(ATA_CMD_SANITIZE_DEVICE), \ ata_opcode_name(ATA_CMD_ZAC_MGMT_IN), \ ata_opcode_name(ATA_CMD_ZAC_MGMT_OUT), \ ata_opcode_name(ATA_CMD_RESTORE), \ ata_opcode_name(ATA_CMD_READ_LONG), \ ata_opcode_name(ATA_CMD_READ_LONG_ONCE), \ ata_opcode_name(ATA_CMD_WRITE_LONG), \ ata_opcode_name(ATA_CMD_WRITE_LONG_ONCE)) #define ata_error_name(result) { result, #result } #define show_error_name(val) \ __print_symbolic(val, \ ata_error_name(ATA_ICRC), \ ata_error_name(ATA_UNC), \ ata_error_name(ATA_MC), \ ata_error_name(ATA_IDNF), \ ata_error_name(ATA_MCR), \ ata_error_name(ATA_ABORTED), \ ata_error_name(ATA_TRK0NF), \ ata_error_name(ATA_AMNF)) #define ata_protocol_name(proto) { proto, #proto } #define show_protocol_name(val) \ __print_symbolic(val, \ ata_protocol_name(ATA_PROT_UNKNOWN), \ ata_protocol_name(ATA_PROT_NODATA), \ ata_protocol_name(ATA_PROT_PIO), \ ata_protocol_name(ATA_PROT_DMA), \ ata_protocol_name(ATA_PROT_NCQ), \ ata_protocol_name(ATA_PROT_NCQ_NODATA), \ ata_protocol_name(ATAPI_PROT_NODATA), \ ata_protocol_name(ATAPI_PROT_PIO), \ ata_protocol_name(ATAPI_PROT_DMA)) const char *libata_trace_parse_status(struct trace_seq*, unsigned char); #define __parse_status(s) libata_trace_parse_status(p, s) const char *libata_trace_parse_eh_action(struct trace_seq *, unsigned int); #define __parse_eh_action(a) libata_trace_parse_eh_action(p, a) const char *libata_trace_parse_eh_err_mask(struct trace_seq *, unsigned int); #define __parse_eh_err_mask(m) libata_trace_parse_eh_err_mask(p, m) const char *libata_trace_parse_qc_flags(struct trace_seq *, unsigned int); #define __parse_qc_flags(f) libata_trace_parse_qc_flags(p, f) const char *libata_trace_parse_subcmd(struct trace_seq *, unsigned char, unsigned char, unsigned char); #define __parse_subcmd(c,f,h) libata_trace_parse_subcmd(p, c, f, h) TRACE_EVENT(ata_qc_issue, TP_PROTO(struct ata_queued_cmd *qc), TP_ARGS(qc), TP_STRUCT__entry( __field( unsigned int, ata_port ) __field( unsigned int, ata_dev ) __field( unsigned int, tag ) __field( unsigned char, cmd ) __field( unsigned char, dev ) __field( unsigned char, lbal ) __field( unsigned char, lbam ) __field( unsigned char, lbah ) __field( unsigned char, nsect ) __field( unsigned char, feature ) __field( unsigned char, hob_lbal ) __field( unsigned char, hob_lbam ) __field( unsigned char, hob_lbah ) __field( unsigned char, hob_nsect ) __field( unsigned char, hob_feature ) __field( unsigned char, ctl ) __field( unsigned char, proto ) __field( unsigned long, flags ) ), TP_fast_assign( __entry->ata_port = qc->ap->print_id; __entry->ata_dev = qc->dev->link->pmp + qc->dev->devno; __entry->tag = qc->tag; __entry->proto = qc->tf.protocol; __entry->cmd = qc->tf.command; __entry->dev = qc->tf.device; __entry->lbal = qc->tf.lbal; __entry->lbam = qc->tf.lbam; __entry->lbah = qc->tf.lbah; __entry->hob_lbal = qc->tf.hob_lbal; __entry->hob_lbam = qc->tf.hob_lbam; __entry->hob_lbah = qc->tf.hob_lbah; __entry->feature = qc->tf.feature; __entry->hob_feature = qc->tf.hob_feature; __entry->nsect = qc->tf.nsect; __entry->hob_nsect = qc->tf.hob_nsect; ), TP_printk("ata_port=%u ata_dev=%u tag=%d proto=%s cmd=%s%s " \ " tf=(%02x/%02x:%02x:%02x:%02x:%02x/%02x:%02x:%02x:%02x:%02x/%02x)", __entry->ata_port, __entry->ata_dev, __entry->tag, show_protocol_name(__entry->proto), show_opcode_name(__entry->cmd), __parse_subcmd(__entry->cmd, __entry->feature, __entry->hob_nsect), __entry->cmd, __entry->feature, __entry->nsect, __entry->lbal, __entry->lbam, __entry->lbah, __entry->hob_feature, __entry->hob_nsect, __entry->hob_lbal, __entry->hob_lbam, __entry->hob_lbah, __entry->dev) ); DECLARE_EVENT_CLASS(ata_qc_complete_template, TP_PROTO(struct ata_queued_cmd *qc), TP_ARGS(qc), TP_STRUCT__entry( __field( unsigned int, ata_port ) __field( unsigned int, ata_dev ) __field( unsigned int, tag ) __field( unsigned char, status ) __field( unsigned char, dev ) __field( unsigned char, lbal ) __field( unsigned char, lbam ) __field( unsigned char, lbah ) __field( unsigned char, nsect ) __field( unsigned char, error ) __field( unsigned char, hob_lbal ) __field( unsigned char, hob_lbam ) __field( unsigned char, hob_lbah ) __field( unsigned char, hob_nsect ) __field( unsigned char, hob_feature ) __field( unsigned char, ctl ) __field( unsigned long, flags ) ), TP_fast_assign( __entry->ata_port = qc->ap->print_id; __entry->ata_dev = qc->dev->link->pmp + qc->dev->devno; __entry->tag = qc->tag; __entry->status = qc->result_tf.command; __entry->dev = qc->result_tf.device; __entry->lbal = qc->result_tf.lbal; __entry->lbam = qc->result_tf.lbam; __entry->lbah = qc->result_tf.lbah; __entry->hob_lbal = qc->result_tf.hob_lbal; __entry->hob_lbam = qc->result_tf.hob_lbam; __entry->hob_lbah = qc->result_tf.hob_lbah; __entry->error = qc->result_tf.feature; __entry->hob_feature = qc->result_tf.hob_feature; __entry->nsect = qc->result_tf.nsect; __entry->hob_nsect = qc->result_tf.hob_nsect; ), TP_printk("ata_port=%u ata_dev=%u tag=%d flags=%s status=%s " \ " res=(%02x/%02x:%02x:%02x:%02x:%02x/%02x:%02x:%02x:%02x:%02x/%02x)", __entry->ata_port, __entry->ata_dev, __entry->tag, __parse_qc_flags(__entry->flags), __parse_status(__entry->status), __entry->status, __entry->error, __entry->nsect, __entry->lbal, __entry->lbam, __entry->lbah, __entry->hob_feature, __entry->hob_nsect, __entry->hob_lbal, __entry->hob_lbam, __entry->hob_lbah, __entry->dev) ); DEFINE_EVENT(ata_qc_complete_template, ata_qc_complete_internal, TP_PROTO(struct ata_queued_cmd *qc), TP_ARGS(qc)); DEFINE_EVENT(ata_qc_complete_template, ata_qc_complete_failed, TP_PROTO(struct ata_queued_cmd *qc), TP_ARGS(qc)); DEFINE_EVENT(ata_qc_complete_template, ata_qc_complete_done, TP_PROTO(struct ata_queued_cmd *qc), TP_ARGS(qc)); TRACE_EVENT(ata_eh_link_autopsy, TP_PROTO(struct ata_device *dev, unsigned int eh_action, unsigned int eh_err_mask), TP_ARGS(dev, eh_action, eh_err_mask), TP_STRUCT__entry( __field( unsigned int, ata_port ) __field( unsigned int, ata_dev ) __field( unsigned int, eh_action ) __field( unsigned int, eh_err_mask) ), TP_fast_assign( __entry->ata_port = dev->link->ap->print_id; __entry->ata_dev = dev->link->pmp + dev->devno; __entry->eh_action = eh_action; __entry->eh_err_mask = eh_err_mask; ), TP_printk("ata_port=%u ata_dev=%u eh_action=%s err_mask=%s", __entry->ata_port, __entry->ata_dev, __parse_eh_action(__entry->eh_action), __parse_eh_err_mask(__entry->eh_err_mask)) ); TRACE_EVENT(ata_eh_link_autopsy_qc, TP_PROTO(struct ata_queued_cmd *qc), TP_ARGS(qc), TP_STRUCT__entry( __field( unsigned int, ata_port ) __field( unsigned int, ata_dev ) __field( unsigned int, tag ) __field( unsigned int, qc_flags ) __field( unsigned int, eh_err_mask) ), TP_fast_assign( __entry->ata_port = qc->ap->print_id; __entry->ata_dev = qc->dev->link->pmp + qc->dev->devno; __entry->tag = qc->tag; __entry->qc_flags = qc->flags; __entry->eh_err_mask = qc->err_mask; ), TP_printk("ata_port=%u ata_dev=%u tag=%d flags=%s err_mask=%s", __entry->ata_port, __entry->ata_dev, __entry->tag, __parse_qc_flags(__entry->qc_flags), __parse_eh_err_mask(__entry->eh_err_mask)) ); #endif /* _TRACE_LIBATA_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * 25-Jul-1998 Major changes to allow for ip chain table * * 3-Jan-2000 Named tables to allow packet selection for different uses. */ /* * Format of an IP6 firewall descriptor * * src, dst, src_mask, dst_mask are always stored in network byte order. * flags are stored in host byte order (of course). * Port numbers are stored in HOST byte order. */ #ifndef _UAPI_IP6_TABLES_H #define _UAPI_IP6_TABLES_H #include <linux/types.h> #include <linux/compiler.h> #include <linux/if.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter/x_tables.h> #ifndef __KERNEL__ #define IP6T_FUNCTION_MAXNAMELEN XT_FUNCTION_MAXNAMELEN #define IP6T_TABLE_MAXNAMELEN XT_TABLE_MAXNAMELEN #define ip6t_match xt_match #define ip6t_target xt_target #define ip6t_table xt_table #define ip6t_get_revision xt_get_revision #define ip6t_entry_match xt_entry_match #define ip6t_entry_target xt_entry_target #define ip6t_standard_target xt_standard_target #define ip6t_error_target xt_error_target #define ip6t_counters xt_counters #define IP6T_CONTINUE XT_CONTINUE #define IP6T_RETURN XT_RETURN /* Pre-iptables-1.4.0 */ #include <linux/netfilter/xt_tcpudp.h> #define ip6t_tcp xt_tcp #define ip6t_udp xt_udp #define IP6T_TCP_INV_SRCPT XT_TCP_INV_SRCPT #define IP6T_TCP_INV_DSTPT XT_TCP_INV_DSTPT #define IP6T_TCP_INV_FLAGS XT_TCP_INV_FLAGS #define IP6T_TCP_INV_OPTION XT_TCP_INV_OPTION #define IP6T_TCP_INV_MASK XT_TCP_INV_MASK #define IP6T_UDP_INV_SRCPT XT_UDP_INV_SRCPT #define IP6T_UDP_INV_DSTPT XT_UDP_INV_DSTPT #define IP6T_UDP_INV_MASK XT_UDP_INV_MASK #define ip6t_counters_info xt_counters_info #define IP6T_STANDARD_TARGET XT_STANDARD_TARGET #define IP6T_ERROR_TARGET XT_ERROR_TARGET #define IP6T_MATCH_ITERATE(e, fn, args...) \ XT_MATCH_ITERATE(struct ip6t_entry, e, fn, ## args) #define IP6T_ENTRY_ITERATE(entries, size, fn, args...) \ XT_ENTRY_ITERATE(struct ip6t_entry, entries, size, fn, ## args) #endif /* Yes, Virginia, you have to zero the padding. */ struct ip6t_ip6 { /* Source and destination IP6 addr */ struct in6_addr src, dst; /* Mask for src and dest IP6 addr */ struct in6_addr smsk, dmsk; char iniface[IFNAMSIZ], outiface[IFNAMSIZ]; unsigned char iniface_mask[IFNAMSIZ], outiface_mask[IFNAMSIZ]; /* Upper protocol number * - The allowed value is 0 (any) or protocol number of last parsable * header, which is 50 (ESP), 59 (No Next Header), 135 (MH), or * the non IPv6 extension headers. * - The protocol numbers of IPv6 extension headers except of ESP and * MH do not match any packets. * - You also need to set IP6T_FLAGS_PROTO to "flags" to check protocol. */ __u16 proto; /* TOS to match iff flags & IP6T_F_TOS */ __u8 tos; /* Flags word */ __u8 flags; /* Inverse flags */ __u8 invflags; }; /* Values for "flag" field in struct ip6t_ip6 (general ip6 structure). */ #define IP6T_F_PROTO 0x01 /* Set if rule cares about upper protocols */ #define IP6T_F_TOS 0x02 /* Match the TOS. */ #define IP6T_F_GOTO 0x04 /* Set if jump is a goto */ #define IP6T_F_MASK 0x07 /* All possible flag bits mask. */ /* Values for "inv" field in struct ip6t_ip6. */ #define IP6T_INV_VIA_IN 0x01 /* Invert the sense of IN IFACE. */ #define IP6T_INV_VIA_OUT 0x02 /* Invert the sense of OUT IFACE */ #define IP6T_INV_TOS 0x04 /* Invert the sense of TOS. */ #define IP6T_INV_SRCIP 0x08 /* Invert the sense of SRC IP. */ #define IP6T_INV_DSTIP 0x10 /* Invert the sense of DST OP. */ #define IP6T_INV_FRAG 0x20 /* Invert the sense of FRAG. */ #define IP6T_INV_PROTO XT_INV_PROTO #define IP6T_INV_MASK 0x7F /* All possible flag bits mask. */ /* This structure defines each of the firewall rules. Consists of 3 parts which are 1) general IP header stuff 2) match specific stuff 3) the target to perform if the rule matches */ struct ip6t_entry { struct ip6t_ip6 ipv6; /* Mark with fields that we care about. */ unsigned int nfcache; /* Size of ipt_entry + matches */ __u16 target_offset; /* Size of ipt_entry + matches + target */ __u16 next_offset; /* Back pointer */ unsigned int comefrom; /* Packet and byte counters. */ struct xt_counters counters; /* The matches (if any), then the target. */ unsigned char elems[0]; }; /* Standard entry */ struct ip6t_standard { struct ip6t_entry entry; struct xt_standard_target target; }; struct ip6t_error { struct ip6t_entry entry; struct xt_error_target target; }; #define IP6T_ENTRY_INIT(__size) \ { \ .target_offset = sizeof(struct ip6t_entry), \ .next_offset = (__size), \ } #define IP6T_STANDARD_INIT(__verdict) \ { \ .entry = IP6T_ENTRY_INIT(sizeof(struct ip6t_standard)), \ .target = XT_TARGET_INIT(XT_STANDARD_TARGET, \ sizeof(struct xt_standard_target)), \ .target.verdict = -(__verdict) - 1, \ } #define IP6T_ERROR_INIT \ { \ .entry = IP6T_ENTRY_INIT(sizeof(struct ip6t_error)), \ .target = XT_TARGET_INIT(XT_ERROR_TARGET, \ sizeof(struct xt_error_target)), \ .target.errorname = "ERROR", \ } /* * New IP firewall options for [gs]etsockopt at the RAW IP level. * Unlike BSD Linux inherits IP options so you don't have to use * a raw socket for this. Instead we check rights in the calls. * * ATTENTION: check linux/in6.h before adding new number here. */ #define IP6T_BASE_CTL 64 #define IP6T_SO_SET_REPLACE (IP6T_BASE_CTL) #define IP6T_SO_SET_ADD_COUNTERS (IP6T_BASE_CTL + 1) #define IP6T_SO_SET_MAX IP6T_SO_SET_ADD_COUNTERS #define IP6T_SO_GET_INFO (IP6T_BASE_CTL) #define IP6T_SO_GET_ENTRIES (IP6T_BASE_CTL + 1) #define IP6T_SO_GET_REVISION_MATCH (IP6T_BASE_CTL + 4) #define IP6T_SO_GET_REVISION_TARGET (IP6T_BASE_CTL + 5) #define IP6T_SO_GET_MAX IP6T_SO_GET_REVISION_TARGET /* obtain original address if REDIRECT'd connection */ #define IP6T_SO_ORIGINAL_DST 80 /* ICMP matching stuff */ struct ip6t_icmp { __u8 type; /* type to match */ __u8 code[2]; /* range of code */ __u8 invflags; /* Inverse flags */ }; /* Values for "inv" field for struct ipt_icmp. */ #define IP6T_ICMP_INV 0x01 /* Invert the sense of type/code test */ /* The argument to IP6T_SO_GET_INFO */ struct ip6t_getinfo { /* Which table: caller fills this in. */ char name[XT_TABLE_MAXNAMELEN]; /* Kernel fills these in. */ /* Which hook entry points are valid: bitmask */ unsigned int valid_hooks; /* Hook entry points: one per netfilter hook. */ unsigned int hook_entry[NF_INET_NUMHOOKS]; /* Underflow points. */ unsigned int underflow[NF_INET_NUMHOOKS]; /* Number of entries */ unsigned int num_entries; /* Size of entries. */ unsigned int size; }; /* The argument to IP6T_SO_SET_REPLACE. */ struct ip6t_replace { /* Which table. */ char name[XT_TABLE_MAXNAMELEN]; /* Which hook entry points are valid: bitmask. You can't change this. */ unsigned int valid_hooks; /* Number of entries */ unsigned int num_entries; /* Total size of new entries */ unsigned int size; /* Hook entry points. */ unsigned int hook_entry[NF_INET_NUMHOOKS]; /* Underflow points. */ unsigned int underflow[NF_INET_NUMHOOKS]; /* Information about old entries: */ /* Number of counters (must be equal to current number of entries). */ unsigned int num_counters; /* The old entries' counters. */ struct xt_counters __user *counters; /* The entries (hang off end: not really an array). */ struct ip6t_entry entries[0]; }; /* The argument to IP6T_SO_GET_ENTRIES. */ struct ip6t_get_entries { /* Which table: user fills this in. */ char name[XT_TABLE_MAXNAMELEN]; /* User fills this in: total entry size. */ unsigned int size; /* The entries. */ struct ip6t_entry entrytable[0]; }; /* Helper functions */ static __inline__ struct xt_entry_target * ip6t_get_target(struct ip6t_entry *e) { return (struct xt_entry_target *)((char *)e + e->target_offset); } /* * Main firewall chains definitions and global var's definitions. */ #endif /* _UAPI_IP6_TABLES_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 /* * Written by: Matthew Dobson, IBM Corporation * * Copyright (C) 2002, IBM Corp. * * All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or * NON INFRINGEMENT. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * Send feedback to <colpatch@us.ibm.com> */ #ifndef _ASM_X86_TOPOLOGY_H #define _ASM_X86_TOPOLOGY_H /* * to preserve the visibility of NUMA_NO_NODE definition, * moved to there from here. May be used independent of * CONFIG_NUMA. */ #include <linux/numa.h> #ifdef CONFIG_NUMA #include <linux/cpumask.h> #include <asm/mpspec.h> #include <asm/percpu.h> /* Mappings between logical cpu number and node number */ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); #ifdef CONFIG_DEBUG_PER_CPU_MAPS /* * override generic percpu implementation of cpu_to_node */ extern int __cpu_to_node(int cpu); #define cpu_to_node __cpu_to_node extern int early_cpu_to_node(int cpu); #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ /* Same function but used if called before per_cpu areas are setup */ static inline int early_cpu_to_node(int cpu) { return early_per_cpu(x86_cpu_to_node_map, cpu); } #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ /* Mappings between node number and cpus on that node. */ extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; #ifdef CONFIG_DEBUG_PER_CPU_MAPS extern const struct cpumask *cpumask_of_node(int node); #else /* Returns a pointer to the cpumask of CPUs on Node 'node'. */ static inline const struct cpumask *cpumask_of_node(int node) { return node_to_cpumask_map[node]; } #endif extern void setup_node_to_cpumask_map(void); #define pcibus_to_node(bus) __pcibus_to_node(bus) extern int __node_distance(int, int); #define node_distance(a, b) __node_distance(a, b) #else /* !CONFIG_NUMA */ static inline int numa_node_id(void) { return 0; } /* * indicate override: */ #define numa_node_id numa_node_id static inline int early_cpu_to_node(int cpu) { return 0; } static inline void setup_node_to_cpumask_map(void) { } #endif #include <asm-generic/topology.h> extern const struct cpumask *cpu_coregroup_mask(int cpu); #define topology_logical_package_id(cpu) (cpu_data(cpu).logical_proc_id) #define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) #define topology_logical_die_id(cpu) (cpu_data(cpu).logical_die_id) #define topology_die_id(cpu) (cpu_data(cpu).cpu_die_id) #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) extern unsigned int __max_die_per_package; #ifdef CONFIG_SMP #define topology_die_cpumask(cpu) (per_cpu(cpu_die_map, cpu)) #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) extern unsigned int __max_logical_packages; #define topology_max_packages() (__max_logical_packages) static inline int topology_max_die_per_package(void) { return __max_die_per_package; } extern int __max_smt_threads; static inline int topology_max_smt_threads(void) { return __max_smt_threads; } int topology_update_package_map(unsigned int apicid, unsigned int cpu); int topology_update_die_map(unsigned int dieid, unsigned int cpu); int topology_phys_to_logical_pkg(unsigned int pkg); int topology_phys_to_logical_die(unsigned int die, unsigned int cpu); bool topology_is_primary_thread(unsigned int cpu); bool topology_smt_supported(void); #else #define topology_max_packages() (1) static inline int topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; } static inline int topology_update_die_map(unsigned int dieid, unsigned int cpu) { return 0; } static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } static inline int topology_phys_to_logical_die(unsigned int die, unsigned int cpu) { return 0; } static inline int topology_max_die_per_package(void) { return 1; } static inline int topology_max_smt_threads(void) { return 1; } static inline bool topology_is_primary_thread(unsigned int cpu) { return true; } static inline bool topology_smt_supported(void) { return false; } #endif static inline void arch_fix_phys_package_id(int num, u32 slot) { } struct pci_bus; int x86_pci_root_bus_node(int bus); void x86_pci_root_bus_resources(int bus, struct list_head *resources); extern bool x86_topology_update; #ifdef CONFIG_SCHED_MC_PRIO #include <asm/percpu.h> DECLARE_PER_CPU_READ_MOSTLY(int, sched_core_priority); extern unsigned int __read_mostly sysctl_sched_itmt_enabled; /* Interface to set priority of a cpu */ void sched_set_itmt_core_prio(int prio, int core_cpu); /* Interface to notify scheduler that system supports ITMT */ int sched_set_itmt_support(void); /* Interface to notify scheduler that system revokes ITMT support */ void sched_clear_itmt_support(void); #else /* CONFIG_SCHED_MC_PRIO */ #define sysctl_sched_itmt_enabled 0 static inline void sched_set_itmt_core_prio(int prio, int core_cpu) { } static inline int sched_set_itmt_support(void) { return 0; } static inline void sched_clear_itmt_support(void) { } #endif /* CONFIG_SCHED_MC_PRIO */ #if defined(CONFIG_SMP) && defined(CONFIG_X86_64) #include <asm/cpufeature.h> DECLARE_STATIC_KEY_FALSE(arch_scale_freq_key); #define arch_scale_freq_invariant() static_branch_likely(&arch_scale_freq_key) DECLARE_PER_CPU(unsigned long, arch_freq_scale); static inline long arch_scale_freq_capacity(int cpu) { return per_cpu(arch_freq_scale, cpu); } #define arch_scale_freq_capacity arch_scale_freq_capacity extern void arch_scale_freq_tick(void); #define arch_scale_freq_tick arch_scale_freq_tick extern void arch_set_max_freq_ratio(bool turbo_disabled); #else static inline void arch_set_max_freq_ratio(bool turbo_disabled) { } #endif #endif /* _ASM_X86_TOPOLOGY_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* include/asm-generic/tlb.h * * Generic TLB shootdown code * * Copyright 2001 Red Hat, Inc. * Based on code from mm/memory.c Copyright Linus Torvalds and others. * * Copyright 2011 Red Hat, Inc., Peter Zijlstra */ #ifndef _ASM_GENERIC__TLB_H #define _ASM_GENERIC__TLB_H #include <linux/mmu_notifier.h> #include <linux/swap.h> #include <linux/hugetlb_inline.h> #include <asm/tlbflush.h> #include <asm/cacheflush.h> /* * Blindly accessing user memory from NMI context can be dangerous * if we're in the middle of switching the current user task or switching * the loaded mm. */ #ifndef nmi_uaccess_okay # define nmi_uaccess_okay() true #endif #ifdef CONFIG_MMU /* * Generic MMU-gather implementation. * * The mmu_gather data structure is used by the mm code to implement the * correct and efficient ordering of freeing pages and TLB invalidations. * * This correct ordering is: * * 1) unhook page * 2) TLB invalidate page * 3) free page * * That is, we must never free a page before we have ensured there are no live * translations left to it. Otherwise it might be possible to observe (or * worse, change) the page content after it has been reused. * * The mmu_gather API consists of: * * - tlb_gather_mmu() / tlb_finish_mmu(); start and finish a mmu_gather * * Finish in particular will issue a (final) TLB invalidate and free * all (remaining) queued pages. * * - tlb_start_vma() / tlb_end_vma(); marks the start / end of a VMA * * Defaults to flushing at tlb_end_vma() to reset the range; helps when * there's large holes between the VMAs. * * - tlb_remove_table() * * tlb_remove_table() is the basic primitive to free page-table directories * (__p*_free_tlb()). In it's most primitive form it is an alias for * tlb_remove_page() below, for when page directories are pages and have no * additional constraints. * * See also MMU_GATHER_TABLE_FREE and MMU_GATHER_RCU_TABLE_FREE. * * - tlb_remove_page() / __tlb_remove_page() * - tlb_remove_page_size() / __tlb_remove_page_size() * * __tlb_remove_page_size() is the basic primitive that queues a page for * freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a * boolean indicating if the queue is (now) full and a call to * tlb_flush_mmu() is required. * * tlb_remove_page() and tlb_remove_page_size() imply the call to * tlb_flush_mmu() when required and has no return value. * * - tlb_change_page_size() * * call before __tlb_remove_page*() to set the current page-size; implies a * possible tlb_flush_mmu() call. * * - tlb_flush_mmu() / tlb_flush_mmu_tlbonly() * * tlb_flush_mmu_tlbonly() - does the TLB invalidate (and resets * related state, like the range) * * tlb_flush_mmu() - in addition to the above TLB invalidate, also frees * whatever pages are still batched. * * - mmu_gather::fullmm * * A flag set by tlb_gather_mmu() to indicate we're going to free * the entire mm; this allows a number of optimizations. * * - We can ignore tlb_{start,end}_vma(); because we don't * care about ranges. Everything will be shot down. * * - (RISC) architectures that use ASIDs can cycle to a new ASID * and delay the invalidation until ASID space runs out. * * - mmu_gather::need_flush_all * * A flag that can be set by the arch code if it wants to force * flush the entire TLB irrespective of the range. For instance * x86-PAE needs this when changing top-level entries. * * And allows the architecture to provide and implement tlb_flush(): * * tlb_flush() may, in addition to the above mentioned mmu_gather fields, make * use of: * * - mmu_gather::start / mmu_gather::end * * which provides the range that needs to be flushed to cover the pages to * be freed. * * - mmu_gather::freed_tables * * set when we freed page table pages * * - tlb_get_unmap_shift() / tlb_get_unmap_size() * * returns the smallest TLB entry size unmapped in this range. * * If an architecture does not provide tlb_flush() a default implementation * based on flush_tlb_range() will be used, unless MMU_GATHER_NO_RANGE is * specified, in which case we'll default to flush_tlb_mm(). * * Additionally there are a few opt-in features: * * MMU_GATHER_PAGE_SIZE * * This ensures we call tlb_flush() every time tlb_change_page_size() actually * changes the size and provides mmu_gather::page_size to tlb_flush(). * * This might be useful if your architecture has size specific TLB * invalidation instructions. * * MMU_GATHER_TABLE_FREE * * This provides tlb_remove_table(), to be used instead of tlb_remove_page() * for page directores (__p*_free_tlb()). * * Useful if your architecture has non-page page directories. * * When used, an architecture is expected to provide __tlb_remove_table() * which does the actual freeing of these pages. * * MMU_GATHER_RCU_TABLE_FREE * * Like MMU_GATHER_TABLE_FREE, and adds semi-RCU semantics to the free (see * comment below). * * Useful if your architecture doesn't use IPIs for remote TLB invalidates * and therefore doesn't naturally serialize with software page-table walkers. * * MMU_GATHER_NO_RANGE * * Use this if your architecture lacks an efficient flush_tlb_range(). * * MMU_GATHER_NO_GATHER * * If the option is set the mmu_gather will not track individual pages for * delayed page free anymore. A platform that enables the option needs to * provide its own implementation of the __tlb_remove_page_size() function to * free pages. * * This is useful if your architecture already flushes TLB entries in the * various ptep_get_and_clear() functions. */ #ifdef CONFIG_MMU_GATHER_TABLE_FREE struct mmu_table_batch { #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE struct rcu_head rcu; #endif unsigned int nr; void *tables[0]; }; #define MAX_TABLE_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *)) extern void tlb_remove_table(struct mmu_gather *tlb, void *table); #else /* !CONFIG_MMU_GATHER_HAVE_TABLE_FREE */ /* * Without MMU_GATHER_TABLE_FREE the architecture is assumed to have page based * page directories and we can use the normal page batching to free them. */ #define tlb_remove_table(tlb, page) tlb_remove_page((tlb), (page)) #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE /* * This allows an architecture that does not use the linux page-tables for * hardware to skip the TLBI when freeing page tables. */ #ifndef tlb_needs_table_invalidate #define tlb_needs_table_invalidate() (true) #endif #else #ifdef tlb_needs_table_invalidate #error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE #endif #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ #ifndef CONFIG_MMU_GATHER_NO_GATHER /* * If we can't allocate a page to make a big batch of page pointers * to work on, then just handle a few from the on-stack structure. */ #define MMU_GATHER_BUNDLE 8 struct mmu_gather_batch { struct mmu_gather_batch *next; unsigned int nr; unsigned int max; struct page *pages[0]; }; #define MAX_GATHER_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *)) /* * Limit the maximum number of mmu_gather batches to reduce a risk of soft * lockups for non-preemptible kernels on huge machines when a lot of memory * is zapped during unmapping. * 10K pages freed at once should be safe even without a preemption point. */ #define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH) extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size); #endif /* * struct mmu_gather is an opaque type used by the mm code for passing around * any data needed by arch specific code for tlb_remove_page. */ struct mmu_gather { struct mm_struct *mm; #ifdef CONFIG_MMU_GATHER_TABLE_FREE struct mmu_table_batch *batch; #endif unsigned long start; unsigned long end; /* * we are in the middle of an operation to clear * a full mm and can make some optimizations */ unsigned int fullmm : 1; /* * we have performed an operation which * requires a complete flush of the tlb */ unsigned int need_flush_all : 1; /* * we have removed page directories */ unsigned int freed_tables : 1; /* * at which levels have we cleared entries? */ unsigned int cleared_ptes : 1; unsigned int cleared_pmds : 1; unsigned int cleared_puds : 1; unsigned int cleared_p4ds : 1; /* * tracks VM_EXEC | VM_HUGETLB in tlb_start_vma */ unsigned int vma_exec : 1; unsigned int vma_huge : 1; unsigned int batch_count; #ifndef CONFIG_MMU_GATHER_NO_GATHER struct mmu_gather_batch *active; struct mmu_gather_batch local; struct page *__pages[MMU_GATHER_BUNDLE]; #ifdef CONFIG_MMU_GATHER_PAGE_SIZE unsigned int page_size; #endif #endif }; void tlb_flush_mmu(struct mmu_gather *tlb); static inline void __tlb_adjust_range(struct mmu_gather *tlb, unsigned long address, unsigned int range_size) { tlb->start = min(tlb->start, address); tlb->end = max(tlb->end, address + range_size); } static inline void __tlb_reset_range(struct mmu_gather *tlb) { if (tlb->fullmm) { tlb->start = tlb->end = ~0; } else { tlb->start = TASK_SIZE; tlb->end = 0; } tlb->freed_tables = 0; tlb->cleared_ptes = 0; tlb->cleared_pmds = 0; tlb->cleared_puds = 0; tlb->cleared_p4ds = 0; /* * Do not reset mmu_gather::vma_* fields here, we do not * call into tlb_start_vma() again to set them if there is an * intermediate flush. */ } #ifdef CONFIG_MMU_GATHER_NO_RANGE #if defined(tlb_flush) || defined(tlb_start_vma) || defined(tlb_end_vma) #error MMU_GATHER_NO_RANGE relies on default tlb_flush(), tlb_start_vma() and tlb_end_vma() #endif /* * When an architecture does not have efficient means of range flushing TLBs * there is no point in doing intermediate flushes on tlb_end_vma() to keep the * range small. We equally don't have to worry about page granularity or other * things. * * All we need to do is issue a full flush for any !0 range. */ static inline void tlb_flush(struct mmu_gather *tlb) { if (tlb->end) flush_tlb_mm(tlb->mm); } static inline void tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { } #define tlb_end_vma tlb_end_vma static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { } #else /* CONFIG_MMU_GATHER_NO_RANGE */ #ifndef tlb_flush #if defined(tlb_start_vma) || defined(tlb_end_vma) #error Default tlb_flush() relies on default tlb_start_vma() and tlb_end_vma() #endif /* * When an architecture does not provide its own tlb_flush() implementation * but does have a reasonably efficient flush_vma_range() implementation * use that. */ static inline void tlb_flush(struct mmu_gather *tlb) { if (tlb->fullmm || tlb->need_flush_all) { flush_tlb_mm(tlb->mm); } else if (tlb->end) { struct vm_area_struct vma = { .vm_mm = tlb->mm, .vm_flags = (tlb->vma_exec ? VM_EXEC : 0) | (tlb->vma_huge ? VM_HUGETLB : 0), }; flush_tlb_range(&vma, tlb->start, tlb->end); } } static inline void tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { /* * flush_tlb_range() implementations that look at VM_HUGETLB (tile, * mips-4k) flush only large pages. * * flush_tlb_range() implementations that flush I-TLB also flush D-TLB * (tile, xtensa, arm), so it's ok to just add VM_EXEC to an existing * range. * * We rely on tlb_end_vma() to issue a flush, such that when we reset * these values the batch is empty. */ tlb->vma_huge = is_vm_hugetlb_page(vma); tlb->vma_exec = !!(vma->vm_flags & VM_EXEC); } #else static inline void tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { } #endif #endif /* CONFIG_MMU_GATHER_NO_RANGE */ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) { /* * Anything calling __tlb_adjust_range() also sets at least one of * these bits. */ if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds || tlb->cleared_puds || tlb->cleared_p4ds)) return; tlb_flush(tlb); mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end); __tlb_reset_range(tlb); } static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { if (__tlb_remove_page_size(tlb, page, page_size)) tlb_flush_mmu(tlb); } static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { return __tlb_remove_page_size(tlb, page, PAGE_SIZE); } /* tlb_remove_page * Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when * required. */ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) { return tlb_remove_page_size(tlb, page, PAGE_SIZE); } static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size) { #ifdef CONFIG_MMU_GATHER_PAGE_SIZE if (tlb->page_size && tlb->page_size != page_size) { if (!tlb->fullmm && !tlb->need_flush_all) tlb_flush_mmu(tlb); } tlb->page_size = page_size; #endif } static inline unsigned long tlb_get_unmap_shift(struct mmu_gather *tlb) { if (tlb->cleared_ptes) return PAGE_SHIFT; if (tlb->cleared_pmds) return PMD_SHIFT; if (tlb->cleared_puds) return PUD_SHIFT; if (tlb->cleared_p4ds) return P4D_SHIFT; return PAGE_SHIFT; } static inline unsigned long tlb_get_unmap_size(struct mmu_gather *tlb) { return 1UL << tlb_get_unmap_shift(tlb); } /* * In the case of tlb vma handling, we can optimise these away in the * case where we're doing a full MM flush. When we're doing a munmap, * the vmas are adjusted to only cover the region to be torn down. */ #ifndef tlb_start_vma static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (tlb->fullmm) return; tlb_update_vma_flags(tlb, vma); flush_cache_range(vma, vma->vm_start, vma->vm_end); } #endif #ifndef tlb_end_vma static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (tlb->fullmm) return; /* * Do a TLB flush and reset the range at VMA boundaries; this avoids * the ranges growing with the unused space between consecutive VMAs, * but also the mmu_gather::vma_* flags from tlb_start_vma() rely on * this. */ tlb_flush_mmu_tlbonly(tlb); } #endif /* * tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end, * and set corresponding cleared_*. */ static inline void tlb_flush_pte_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_ptes = 1; } static inline void tlb_flush_pmd_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_pmds = 1; } static inline void tlb_flush_pud_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_puds = 1; } static inline void tlb_flush_p4d_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_p4ds = 1; } #ifndef __tlb_remove_tlb_entry #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) #endif /** * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation. * * Record the fact that pte's were really unmapped by updating the range, * so we can later optimise away the tlb invalidate. This helps when * userspace is unmapping already-unmapped pages, which happens quite a lot. */ #define tlb_remove_tlb_entry(tlb, ptep, address) \ do { \ tlb_flush_pte_range(tlb, address, PAGE_SIZE); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ do { \ unsigned long _sz = huge_page_size(h); \ if (_sz == PMD_SIZE) \ tlb_flush_pmd_range(tlb, address, _sz); \ else if (_sz == PUD_SIZE) \ tlb_flush_pud_range(tlb, address, _sz); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) /** * tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation * This is a nop so far, because only x86 needs it. */ #ifndef __tlb_remove_pmd_tlb_entry #define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0) #endif #define tlb_remove_pmd_tlb_entry(tlb, pmdp, address) \ do { \ tlb_flush_pmd_range(tlb, address, HPAGE_PMD_SIZE); \ __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \ } while (0) /** * tlb_remove_pud_tlb_entry - remember a pud mapping for later tlb * invalidation. This is a nop so far, because only x86 needs it. */ #ifndef __tlb_remove_pud_tlb_entry #define __tlb_remove_pud_tlb_entry(tlb, pudp, address) do {} while (0) #endif #define tlb_remove_pud_tlb_entry(tlb, pudp, address) \ do { \ tlb_flush_pud_range(tlb, address, HPAGE_PUD_SIZE); \ __tlb_remove_pud_tlb_entry(tlb, pudp, address); \ } while (0) /* * For things like page tables caches (ie caching addresses "inside" the * page tables, like x86 does), for legacy reasons, flushing an * individual page had better flush the page table caches behind it. This * is definitely how x86 works, for example. And if you have an * architected non-legacy page table cache (which I'm not aware of * anybody actually doing), you're going to have some architecturally * explicit flushing for that, likely *separate* from a regular TLB entry * flush, and thus you'd need more than just some range expansion.. * * So if we ever find an architecture * that would want something that odd, I think it is up to that * architecture to do its own odd thing, not cause pain for others * http://lkml.kernel.org/r/CA+55aFzBggoXtNXQeng5d_mRoDnaMBE5Y+URs+PHR67nUpMtaw@mail.gmail.com * * For now w.r.t page table cache, mark the range_size as PAGE_SIZE */ #ifndef pte_free_tlb #define pte_free_tlb(tlb, ptep, address) \ do { \ tlb_flush_pmd_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pte_free_tlb(tlb, ptep, address); \ } while (0) #endif #ifndef pmd_free_tlb #define pmd_free_tlb(tlb, pmdp, address) \ do { \ tlb_flush_pud_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pmd_free_tlb(tlb, pmdp, address); \ } while (0) #endif #ifndef pud_free_tlb #define pud_free_tlb(tlb, pudp, address) \ do { \ tlb_flush_p4d_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pud_free_tlb(tlb, pudp, address); \ } while (0) #endif #ifndef p4d_free_tlb #define p4d_free_tlb(tlb, pudp, address) \ do { \ __tlb_adjust_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __p4d_free_tlb(tlb, pudp, address); \ } while (0) #endif #endif /* CONFIG_MMU */ #endif /* _ASM_GENERIC__TLB_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __IEEE802154_CORE_H #define __IEEE802154_CORE_H #include <net/cfg802154.h> struct cfg802154_registered_device { const struct cfg802154_ops *ops; struct list_head list; /* wpan_phy index, internal only */ int wpan_phy_idx; /* also protected by devlist_mtx */ int opencount; wait_queue_head_t dev_wait; /* protected by RTNL only */ int num_running_ifaces; /* associated wpan interfaces, protected by rtnl or RCU */ struct list_head wpan_dev_list; int devlist_generation, wpan_dev_id; /* must be last because of the way we do wpan_phy_priv(), * and it should at least be aligned to NETDEV_ALIGN */ struct wpan_phy wpan_phy __aligned(NETDEV_ALIGN); }; static inline struct cfg802154_registered_device * wpan_phy_to_rdev(struct wpan_phy *wpan_phy) { BUG_ON(!wpan_phy); return container_of(wpan_phy, struct cfg802154_registered_device, wpan_phy); } extern struct list_head cfg802154_rdev_list; extern int cfg802154_rdev_list_generation; int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, struct net *net); /* free object */ void cfg802154_dev_free(struct cfg802154_registered_device *rdev); struct cfg802154_registered_device * cfg802154_rdev_by_wpan_phy_idx(int wpan_phy_idx); struct wpan_phy *wpan_phy_idx_to_wpan_phy(int wpan_phy_idx); #endif /* __IEEE802154_CORE_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 // SPDX-License-Identifier: GPL-2.0-or-later /* Common capabilities, needed by capability.o. */ #include <linux/capability.h> #include <linux/audit.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/lsm_hooks.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/ptrace.h> #include <linux/xattr.h> #include <linux/hugetlb.h> #include <linux/mount.h> #include <linux/sched.h> #include <linux/prctl.h> #include <linux/securebits.h> #include <linux/user_namespace.h> #include <linux/binfmts.h> #include <linux/personality.h> /* * If a non-root user executes a setuid-root binary in * !secure(SECURE_NOROOT) mode, then we raise capabilities. * However if fE is also set, then the intent is for only * the file capabilities to be applied, and the setuid-root * bit is left on either to change the uid (plausible) or * to get full privilege on a kernel without file capabilities * support. So in that case we do not raise capabilities. * * Warn if that happens, once per boot. */ static void warn_setuid_and_fcaps_mixed(const char *fname) { static int warned; if (!warned) { printk(KERN_INFO "warning: `%s' has both setuid-root and" " effective capabilities. Therefore not raising all" " capabilities.\n", fname); warned = 1; } } /** * cap_capable - Determine whether a task has a particular effective capability * @cred: The credentials to use * @ns: The user namespace in which we need the capability * @cap: The capability to check for * @opts: Bitmask of options defined in include/linux/security.h * * Determine whether the nominated task has the specified capability amongst * its effective set, returning 0 if it does, -ve if it does not. * * NOTE WELL: cap_has_capability() cannot be used like the kernel's capable() * and has_capability() functions. That is, it has the reverse semantics: * cap_has_capability() returns 0 when a task has a capability, but the * kernel's capable() and has_capability() returns 1 for this case. */ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns, int cap, unsigned int opts) { struct user_namespace *ns = targ_ns; /* See if cred has the capability in the target user namespace * by examining the target user namespace and all of the target * user namespace's parents. */ for (;;) { /* Do we have the necessary capabilities? */ if (ns == cred->user_ns) return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM; /* * If we're already at a lower level than we're looking for, * we're done searching. */ if (ns->level <= cred->user_ns->level) return -EPERM; /* * The owner of the user namespace in the parent of the * user namespace has all caps. */ if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid)) return 0; /* * If you have a capability in a parent user ns, then you have * it over all children user namespaces as well. */ ns = ns->parent; } /* We never get here */ } /** * cap_settime - Determine whether the current process may set the system clock * @ts: The time to set * @tz: The timezone to set * * Determine whether the current process may set the system clock and timezone * information, returning 0 if permission granted, -ve if denied. */ int cap_settime(const struct timespec64 *ts, const struct timezone *tz) { if (!capable(CAP_SYS_TIME)) return -EPERM; return 0; } /** * cap_ptrace_access_check - Determine whether the current process may access * another * @child: The process to be accessed * @mode: The mode of attachment. * * If we are in the same or an ancestor user_ns and have all the target * task's capabilities, then ptrace access is allowed. * If we have the ptrace capability to the target user_ns, then ptrace * access is allowed. * Else denied. * * Determine whether a process may access another, returning 0 if permission * granted, -ve if denied. */ int cap_ptrace_access_check(struct task_struct *child, unsigned int mode) { int ret = 0; const struct cred *cred, *child_cred; const kernel_cap_t *caller_caps; rcu_read_lock(); cred = current_cred(); child_cred = __task_cred(child); if (mode & PTRACE_MODE_FSCREDS) caller_caps = &cred->cap_effective; else caller_caps = &cred->cap_permitted; if (cred->user_ns == child_cred->user_ns && cap_issubset(child_cred->cap_permitted, *caller_caps)) goto out; if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE)) goto out; ret = -EPERM; out: rcu_read_unlock(); return ret; } /** * cap_ptrace_traceme - Determine whether another process may trace the current * @parent: The task proposed to be the tracer * * If parent is in the same or an ancestor user_ns and has all current's * capabilities, then ptrace access is allowed. * If parent has the ptrace capability to current's user_ns, then ptrace * access is allowed. * Else denied. * * Determine whether the nominated task is permitted to trace the current * process, returning 0 if permission is granted, -ve if denied. */ int cap_ptrace_traceme(struct task_struct *parent) { int ret = 0; const struct cred *cred, *child_cred; rcu_read_lock(); cred = __task_cred(parent); child_cred = current_cred(); if (cred->user_ns == child_cred->user_ns && cap_issubset(child_cred->cap_permitted, cred->cap_permitted)) goto out; if (has_ns_capability(parent, child_cred->user_ns, CAP_SYS_PTRACE)) goto out; ret = -EPERM; out: rcu_read_unlock(); return ret; } /** * cap_capget - Retrieve a task's capability sets * @target: The task from which to retrieve the capability sets * @effective: The place to record the effective set * @inheritable: The place to record the inheritable set * @permitted: The place to record the permitted set * * This function retrieves the capabilities of the nominated task and returns * them to the caller. */ int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { const struct cred *cred; /* Derived from kernel/capability.c:sys_capget. */ rcu_read_lock(); cred = __task_cred(target); *effective = cred->cap_effective; *inheritable = cred->cap_inheritable; *permitted = cred->cap_permitted; rcu_read_unlock(); return 0; } /* * Determine whether the inheritable capabilities are limited to the old * permitted set. Returns 1 if they are limited, 0 if they are not. */ static inline int cap_inh_is_capped(void) { /* they are so limited unless the current task has the CAP_SETPCAP * capability */ if (cap_capable(current_cred(), current_cred()->user_ns, CAP_SETPCAP, CAP_OPT_NONE) == 0) return 0; return 1; } /** * cap_capset - Validate and apply proposed changes to current's capabilities * @new: The proposed new credentials; alterations should be made here * @old: The current task's current credentials * @effective: A pointer to the proposed new effective capabilities set * @inheritable: A pointer to the proposed new inheritable capabilities set * @permitted: A pointer to the proposed new permitted capabilities set * * This function validates and applies a proposed mass change to the current * process's capability sets. The changes are made to the proposed new * credentials, and assuming no error, will be committed by the caller of LSM. */ int cap_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) { if (cap_inh_is_capped() && !cap_issubset(*inheritable, cap_combine(old->cap_inheritable, old->cap_permitted))) /* incapable of using this inheritable set */ return -EPERM; if (!cap_issubset(*inheritable, cap_combine(old->cap_inheritable, old->cap_bset))) /* no new pI capabilities outside bounding set */ return -EPERM; /* verify restrictions on target's new Permitted set */ if (!cap_issubset(*permitted, old->cap_permitted)) return -EPERM; /* verify the _new_Effective_ is a subset of the _new_Permitted_ */ if (!cap_issubset(*effective, *permitted)) return -EPERM; new->cap_effective = *effective; new->cap_inheritable = *inheritable; new->cap_permitted = *permitted; /* * Mask off ambient bits that are no longer both permitted and * inheritable. */ new->cap_ambient = cap_intersect(new->cap_ambient, cap_intersect(*permitted, *inheritable)); if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EINVAL; return 0; } /** * cap_inode_need_killpriv - Determine if inode change affects privileges * @dentry: The inode/dentry in being changed with change marked ATTR_KILL_PRIV * * Determine if an inode having a change applied that's marked ATTR_KILL_PRIV * affects the security markings on that inode, and if it is, should * inode_killpriv() be invoked or the change rejected. * * Returns 1 if security.capability has a value, meaning inode_killpriv() * is required, 0 otherwise, meaning inode_killpriv() is not required. */ int cap_inode_need_killpriv(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); int error; error = __vfs_getxattr(dentry, inode, XATTR_NAME_CAPS, NULL, 0); return error > 0; } /** * cap_inode_killpriv - Erase the security markings on an inode * @dentry: The inode/dentry to alter * * Erase the privilege-enhancing security markings on an inode. * * Returns 0 if successful, -ve on error. */ int cap_inode_killpriv(struct dentry *dentry) { int error; error = __vfs_removexattr(dentry, XATTR_NAME_CAPS); if (error == -EOPNOTSUPP) error = 0; return error; } static bool rootid_owns_currentns(kuid_t kroot) { struct user_namespace *ns; if (!uid_valid(kroot)) return false; for (ns = current_user_ns(); ; ns = ns->parent) { if (from_kuid(ns, kroot) == 0) return true; if (ns == &init_user_ns) break; } return false; } static __u32 sansflags(__u32 m) { return m & ~VFS_CAP_FLAGS_EFFECTIVE; } static bool is_v2header(size_t size, const struct vfs_cap_data *cap) { if (size != XATTR_CAPS_SZ_2) return false; return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2; } static bool is_v3header(size_t size, const struct vfs_cap_data *cap) { if (size != XATTR_CAPS_SZ_3) return false; return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3; } /* * getsecurity: We are called for security.* before any attempt to read the * xattr from the inode itself. * * This gives us a chance to read the on-disk value and convert it. If we * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler. * * Note we are not called by vfs_getxattr_alloc(), but that is only called * by the integrity subsystem, which really wants the unconverted values - * so that's good. */ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc) { int size, ret; kuid_t kroot; u32 nsmagic, magic; uid_t root, mappedroot; char *tmpbuf = NULL; struct vfs_cap_data *cap; struct vfs_ns_cap_data *nscap = NULL; struct dentry *dentry; struct user_namespace *fs_ns; if (strcmp(name, "capability") != 0) return -EOPNOTSUPP; dentry = d_find_any_alias(inode); if (!dentry) return -EINVAL; size = sizeof(struct vfs_ns_cap_data); ret = (int) vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS, &tmpbuf, size, GFP_NOFS); dput(dentry); if (ret < 0 || !tmpbuf) return ret; fs_ns = inode->i_sb->s_user_ns; cap = (struct vfs_cap_data *) tmpbuf; if (is_v2header((size_t) ret, cap)) { root = 0; } else if (is_v3header((size_t) ret, cap)) { nscap = (struct vfs_ns_cap_data *) tmpbuf; root = le32_to_cpu(nscap->rootid); } else { size = -EINVAL; goto out_free; } kroot = make_kuid(fs_ns, root); /* If the root kuid maps to a valid uid in current ns, then return * this as a nscap. */ mappedroot = from_kuid(current_user_ns(), kroot); if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) { size = sizeof(struct vfs_ns_cap_data); if (alloc) { if (!nscap) { /* v2 -> v3 conversion */ nscap = kzalloc(size, GFP_ATOMIC); if (!nscap) { size = -ENOMEM; goto out_free; } nsmagic = VFS_CAP_REVISION_3; magic = le32_to_cpu(cap->magic_etc); if (magic & VFS_CAP_FLAGS_EFFECTIVE) nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); nscap->magic_etc = cpu_to_le32(nsmagic); } else { /* use allocated v3 buffer */ tmpbuf = NULL; } nscap->rootid = cpu_to_le32(mappedroot); *buffer = nscap; } goto out_free; } if (!rootid_owns_currentns(kroot)) { size = -EOVERFLOW; goto out_free; } /* This comes from a parent namespace. Return as a v2 capability */ size = sizeof(struct vfs_cap_data); if (alloc) { if (nscap) { /* v3 -> v2 conversion */ cap = kzalloc(size, GFP_ATOMIC); if (!cap) { size = -ENOMEM; goto out_free; } magic = VFS_CAP_REVISION_2; nsmagic = le32_to_cpu(nscap->magic_etc); if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE) magic |= VFS_CAP_FLAGS_EFFECTIVE; memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32); cap->magic_etc = cpu_to_le32(magic); } else { /* use unconverted v2 */ tmpbuf = NULL; } *buffer = cap; } out_free: kfree(tmpbuf); return size; } static kuid_t rootid_from_xattr(const void *value, size_t size, struct user_namespace *task_ns) { const struct vfs_ns_cap_data *nscap = value; uid_t rootid = 0; if (size == XATTR_CAPS_SZ_3) rootid = le32_to_cpu(nscap->rootid); return make_kuid(task_ns, rootid); } static bool validheader(size_t size, const struct vfs_cap_data *cap) { return is_v2header(size, cap) || is_v3header(size, cap); } /* * User requested a write of security.capability. If needed, update the * xattr to change from v2 to v3, or to fixup the v3 rootid. * * If all is ok, we return the new size, on error return < 0. */ int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size) { struct vfs_ns_cap_data *nscap; uid_t nsrootid; const struct vfs_cap_data *cap = *ivalue; __u32 magic, nsmagic; struct inode *inode = d_backing_inode(dentry); struct user_namespace *task_ns = current_user_ns(), *fs_ns = inode->i_sb->s_user_ns; kuid_t rootid; size_t newsize; if (!*ivalue) return -EINVAL; if (!validheader(size, cap)) return -EINVAL; if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP)) return -EPERM; if (size == XATTR_CAPS_SZ_2) if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP)) /* user is privileged, just write the v2 */ return size; rootid = rootid_from_xattr(*ivalue, size, task_ns); if (!uid_valid(rootid)) return -EINVAL; nsrootid = from_kuid(fs_ns, rootid); if (nsrootid == -1) return -EINVAL; newsize = sizeof(struct vfs_ns_cap_data); nscap = kmalloc(newsize, GFP_ATOMIC); if (!nscap) return -ENOMEM; nscap->rootid = cpu_to_le32(nsrootid); nsmagic = VFS_CAP_REVISION_3; magic = le32_to_cpu(cap->magic_etc); if (magic & VFS_CAP_FLAGS_EFFECTIVE) nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; nscap->magic_etc = cpu_to_le32(nsmagic); memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); kvfree(*ivalue); *ivalue = nscap; return newsize; } /* * Calculate the new process capability sets from the capability sets attached * to a file. */ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, struct linux_binprm *bprm, bool *effective, bool *has_fcap) { struct cred *new = bprm->cred; unsigned i; int ret = 0; if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE) *effective = true; if (caps->magic_etc & VFS_CAP_REVISION_MASK) *has_fcap = true; CAP_FOR_EACH_U32(i) { __u32 permitted = caps->permitted.cap[i]; __u32 inheritable = caps->inheritable.cap[i]; /* * pP' = (X & fP) | (pI & fI) * The addition of pA' is handled later. */ new->cap_permitted.cap[i] = (new->cap_bset.cap[i] & permitted) | (new->cap_inheritable.cap[i] & inheritable); if (permitted & ~new->cap_permitted.cap[i]) /* insufficient to execute correctly */ ret = -EPERM; } /* * For legacy apps, with no internal support for recognizing they * do not have enough capabilities, we return an error if they are * missing some "forced" (aka file-permitted) capabilities. */ return *effective ? ret : 0; } /* * Extract the on-exec-apply capability sets for an executable file. */ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps) { struct inode *inode = d_backing_inode(dentry); __u32 magic_etc; unsigned tocopy, i; int size; struct vfs_ns_cap_data data, *nscaps = &data; struct vfs_cap_data *caps = (struct vfs_cap_data *) &data; kuid_t rootkuid; struct user_namespace *fs_ns; memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data)); if (!inode) return -ENODATA; fs_ns = inode->i_sb->s_user_ns; size = __vfs_getxattr((struct dentry *)dentry, inode, XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ); if (size == -ENODATA || size == -EOPNOTSUPP) /* no data, that's ok */ return -ENODATA; if (size < 0) return size; if (size < sizeof(magic_etc)) return -EINVAL; cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc); rootkuid = make_kuid(fs_ns, 0); switch (magic_etc & VFS_CAP_REVISION_MASK) { case VFS_CAP_REVISION_1: if (size != XATTR_CAPS_SZ_1) return -EINVAL; tocopy = VFS_CAP_U32_1; break; case VFS_CAP_REVISION_2: if (size != XATTR_CAPS_SZ_2) return -EINVAL; tocopy = VFS_CAP_U32_2; break; case VFS_CAP_REVISION_3: if (size != XATTR_CAPS_SZ_3) return -EINVAL; tocopy = VFS_CAP_U32_3; rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid)); break; default: return -EINVAL; } /* Limit the caps to the mounter of the filesystem * or the more limited uid specified in the xattr. */ if (!rootid_owns_currentns(rootkuid)) return -ENODATA; CAP_FOR_EACH_U32(i) { if (i >= tocopy) break; cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted); cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable); } cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; cpu_caps->rootid = rootkuid; return 0; } /* * Attempt to get the on-exec apply capability sets for an executable file from * its xattrs and, if present, apply them to the proposed credentials being * constructed by execve(). */ static int get_file_caps(struct linux_binprm *bprm, struct file *file, bool *effective, bool *has_fcap) { int rc = 0; struct cpu_vfs_cap_data vcaps; cap_clear(bprm->cred->cap_permitted); if (!file_caps_enabled) return 0; if (!mnt_may_suid(file->f_path.mnt)) return 0; /* * This check is redundant with mnt_may_suid() but is kept to make * explicit that capability bits are limited to s_user_ns and its * descendants. */ if (!current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns)) return 0; rc = get_vfs_caps_from_disk(file->f_path.dentry, &vcaps); if (rc < 0) { if (rc == -EINVAL) printk(KERN_NOTICE "Invalid argument reading file caps for %s\n", bprm->filename); else if (rc == -ENODATA) rc = 0; goto out; } rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective, has_fcap); out: if (rc) cap_clear(bprm->cred->cap_permitted); return rc; } static inline bool root_privileged(void) { return !issecure(SECURE_NOROOT); } static inline bool __is_real(kuid_t uid, struct cred *cred) { return uid_eq(cred->uid, uid); } static inline bool __is_eff(kuid_t uid, struct cred *cred) { return uid_eq(cred->euid, uid); } static inline bool __is_suid(kuid_t uid, struct cred *cred) { return !__is_real(uid, cred) && __is_eff(uid, cred); } /* * handle_privileged_root - Handle case of privileged root * @bprm: The execution parameters, including the proposed creds * @has_fcap: Are any file capabilities set? * @effective: Do we have effective root privilege? * @root_uid: This namespace' root UID WRT initial USER namespace * * Handle the case where root is privileged and hasn't been neutered by * SECURE_NOROOT. If file capabilities are set, they won't be combined with * set UID root and nothing is changed. If we are root, cap_permitted is * updated. If we have become set UID root, the effective bit is set. */ static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap, bool *effective, kuid_t root_uid) { const struct cred *old = current_cred(); struct cred *new = bprm->cred; if (!root_privileged()) return; /* * If the legacy file capability is set, then don't set privs * for a setuid root binary run by a non-root user. Do set it * for a root user just to cause least surprise to an admin. */ if (has_fcap && __is_suid(root_uid, new)) { warn_setuid_and_fcaps_mixed(bprm->filename); return; } /* * To support inheritance of root-permissions and suid-root * executables under compatibility mode, we override the * capability sets for the file. */ if (__is_eff(root_uid, new) || __is_real(root_uid, new)) { /* pP' = (cap_bset & ~0) | (pI & ~0) */ new->cap_permitted = cap_combine(old->cap_bset, old->cap_inheritable); } /* * If only the real uid is 0, we do not set the effective bit. */ if (__is_eff(root_uid, new)) *effective = true; } #define __cap_gained(field, target, source) \ !cap_issubset(target->cap_##field, source->cap_##field) #define __cap_grew(target, source, cred) \ !cap_issubset(cred->cap_##target, cred->cap_##source) #define __cap_full(field, cred) \ cap_issubset(CAP_FULL_SET, cred->cap_##field) static inline bool __is_setuid(struct cred *new, const struct cred *old) { return !uid_eq(new->euid, old->uid); } static inline bool __is_setgid(struct cred *new, const struct cred *old) { return !gid_eq(new->egid, old->gid); } /* * 1) Audit candidate if current->cap_effective is set * * We do not bother to audit if 3 things are true: * 1) cap_effective has all caps * 2) we became root *OR* are were already root * 3) root is supposed to have all caps (SECURE_NOROOT) * Since this is just a normal root execing a process. * * Number 1 above might fail if you don't have a full bset, but I think * that is interesting information to audit. * * A number of other conditions require logging: * 2) something prevented setuid root getting all caps * 3) non-setuid root gets fcaps * 4) non-setuid root gets ambient */ static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old, kuid_t root, bool has_fcap) { bool ret = false; if ((__cap_grew(effective, ambient, new) && !(__cap_full(effective, new) && (__is_eff(root, new) || __is_real(root, new)) && root_privileged())) || (root_privileged() && __is_suid(root, new) && !__cap_full(effective, new)) || (!__is_setuid(new, old) && ((has_fcap && __cap_gained(permitted, new, old)) || __cap_gained(ambient, new, old)))) ret = true; return ret; } /** * cap_bprm_creds_from_file - Set up the proposed credentials for execve(). * @bprm: The execution parameters, including the proposed creds * @file: The file to pull the credentials from * * Set up the proposed credentials for a new execution context being * constructed by execve(). The proposed creds in @bprm->cred is altered, * which won't take effect immediately. Returns 0 if successful, -ve on error. */ int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file) { /* Process setpcap binaries and capabilities for uid 0 */ const struct cred *old = current_cred(); struct cred *new = bprm->cred; bool effective = false, has_fcap = false, is_setid; int ret; kuid_t root_uid; if (WARN_ON(!cap_ambient_invariant_ok(old))) return -EPERM; ret = get_file_caps(bprm, file, &effective, &has_fcap); if (ret < 0) return ret; root_uid = make_kuid(new->user_ns, 0); handle_privileged_root(bprm, has_fcap, &effective, root_uid); /* if we have fs caps, clear dangerous personality flags */ if (__cap_gained(permitted, new, old)) bprm->per_clear |= PER_CLEAR_ON_SETID; /* Don't let someone trace a set[ug]id/setpcap binary with the revised * credentials unless they have the appropriate permit. * * In addition, if NO_NEW_PRIVS, then ensure we get no new privs. */ is_setid = __is_setuid(new, old) || __is_setgid(new, old); if ((is_setid || __cap_gained(permitted, new, old)) && ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) || !ptracer_capable(current, new->user_ns))) { /* downgrade; they get no more than they had, and maybe less */ if (!ns_capable(new->user_ns, CAP_SETUID) || (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) { new->euid = new->uid; new->egid = new->gid; } new->cap_permitted = cap_intersect(new->cap_permitted, old->cap_permitted); } new->suid = new->fsuid = new->euid; new->sgid = new->fsgid = new->egid; /* File caps or setid cancels ambient. */ if (has_fcap || is_setid) cap_clear(new->cap_ambient); /* * Now that we've computed pA', update pP' to give: * pP' = (X & fP) | (pI & fI) | pA' */ new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient); /* * Set pE' = (fE ? pP' : pA'). Because pA' is zero if fE is set, * this is the same as pE' = (fE ? pP' : 0) | pA'. */ if (effective) new->cap_effective = new->cap_permitted; else new->cap_effective = new->cap_ambient; if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EPERM; if (nonroot_raised_pE(new, old, root_uid, has_fcap)) { ret = audit_log_bprm_fcaps(bprm, new, old); if (ret < 0) return ret; } new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EPERM; /* Check for privilege-elevated exec. */ if (is_setid || (!__is_real(root_uid, new) && (effective || __cap_grew(permitted, ambient, new)))) bprm->secureexec = 1; return 0; } /** * cap_inode_setxattr - Determine whether an xattr may be altered * @dentry: The inode/dentry being altered * @name: The name of the xattr to be changed * @value: The value that the xattr will be changed to * @size: The size of value * @flags: The replacement flag * * Determine whether an xattr may be altered or set on an inode, returning 0 if * permission is granted, -ve if denied. * * This is used to make sure security xattrs don't get updated or set by those * who aren't privileged to do so. */ int cap_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct user_namespace *user_ns = dentry->d_sb->s_user_ns; /* Ignore non-security xattrs */ if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) != 0) return 0; /* * For XATTR_NAME_CAPS the check will be done in * cap_convert_nscap(), called by setxattr() */ if (strcmp(name, XATTR_NAME_CAPS) == 0) return 0; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } /** * cap_inode_removexattr - Determine whether an xattr may be removed * @dentry: The inode/dentry being altered * @name: The name of the xattr to be changed * * Determine whether an xattr may be removed from an inode, returning 0 if * permission is granted, -ve if denied. * * This is used to make sure security xattrs don't get removed by those who * aren't privileged to remove them. */ int cap_inode_removexattr(struct dentry *dentry, const char *name) { struct user_namespace *user_ns = dentry->d_sb->s_user_ns; /* Ignore non-security xattrs */ if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) != 0) return 0; if (strcmp(name, XATTR_NAME_CAPS) == 0) { /* security.capability gets namespaced */ struct inode *inode = d_backing_inode(dentry); if (!inode) return -EINVAL; if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP)) return -EPERM; return 0; } if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } /* * cap_emulate_setxuid() fixes the effective / permitted capabilities of * a process after a call to setuid, setreuid, or setresuid. * * 1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of * {r,e,s}uid != 0, the permitted and effective capabilities are * cleared. * * 2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective * capabilities of the process are cleared. * * 3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective * capabilities are set to the permitted capabilities. * * fsuid is handled els