1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 /* SPDX-License-Identifier: GPL-2.0 */ /* * Events for filesystem locks * * Copyright 2013 Jeff Layton <jlayton@poochiereds.net> */ #undef TRACE_SYSTEM #define TRACE_SYSTEM filelock #if !defined(_TRACE_FILELOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FILELOCK_H #include <linux/tracepoint.h> #include <linux/fs.h> #include <linux/device.h> #include <linux/kdev_t.h> #define show_fl_flags(val) \ __print_flags(val, "|", \ { FL_POSIX, "FL_POSIX" }, \ { FL_FLOCK, "FL_FLOCK" }, \ { FL_DELEG, "FL_DELEG" }, \ { FL_ACCESS, "FL_ACCESS" }, \ { FL_EXISTS, "FL_EXISTS" }, \ { FL_LEASE, "FL_LEASE" }, \ { FL_CLOSE, "FL_CLOSE" }, \ { FL_SLEEP, "FL_SLEEP" }, \ { FL_DOWNGRADE_PENDING, "FL_DOWNGRADE_PENDING" }, \ { FL_UNLOCK_PENDING, "FL_UNLOCK_PENDING" }, \ { FL_OFDLCK, "FL_OFDLCK" }) #define show_fl_type(val) \ __print_symbolic(val, \ { F_RDLCK, "F_RDLCK" }, \ { F_WRLCK, "F_WRLCK" }, \ { F_UNLCK, "F_UNLCK" }) TRACE_EVENT(locks_get_lock_context, TP_PROTO(struct inode *inode, int type, struct file_lock_context *ctx), TP_ARGS(inode, type, ctx), TP_STRUCT__entry( __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(unsigned char, type) __field(struct file_lock_context *, ctx) ), TP_fast_assign( __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->type = type; __entry->ctx = ctx; ), TP_printk("dev=0x%x:0x%x ino=0x%lx type=%s ctx=%p", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, show_fl_type(__entry->type), __entry->ctx) ); DECLARE_EVENT_CLASS(filelock_lock, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret), TP_STRUCT__entry( __field(struct file_lock *, fl) __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(struct file_lock *, fl_blocker) __field(fl_owner_t, fl_owner) __field(unsigned int, fl_pid) __field(unsigned int, fl_flags) __field(unsigned char, fl_type) __field(loff_t, fl_start) __field(loff_t, fl_end) __field(int, ret) ), TP_fast_assign( __entry->fl = fl ? fl : NULL; __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->fl_blocker = fl ? fl->fl_blocker : NULL; __entry->fl_owner = fl ? fl->fl_owner : NULL; __entry->fl_pid = fl ? fl->fl_pid : 0; __entry->fl_flags = fl ? fl->fl_flags : 0; __entry->fl_type = fl ? fl->fl_type : 0; __entry->fl_start = fl ? fl->fl_start : 0; __entry->fl_end = fl ? fl->fl_end : 0; __entry->ret = ret; ), TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_pid=%u fl_flags=%s fl_type=%s fl_start=%lld fl_end=%lld ret=%d", __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->fl_blocker, __entry->fl_owner, __entry->fl_pid, show_fl_flags(__entry->fl_flags), show_fl_type(__entry->fl_type), __entry->fl_start, __entry->fl_end, __entry->ret) ); DEFINE_EVENT(filelock_lock, posix_lock_inode, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, fcntl_setlk, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, locks_remove_posix, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, flock_lock_inode, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DECLARE_EVENT_CLASS(filelock_lease, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl), TP_STRUCT__entry( __field(struct file_lock *, fl) __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(struct file_lock *, fl_blocker) __field(fl_owner_t, fl_owner) __field(unsigned int, fl_flags) __field(unsigned char, fl_type) __field(unsigned long, fl_break_time) __field(unsigned long, fl_downgrade_time) ), TP_fast_assign( __entry->fl = fl ? fl : NULL; __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->fl_blocker = fl ? fl->fl_blocker : NULL; __entry->fl_owner = fl ? fl->fl_owner : NULL; __entry->fl_flags = fl ? fl->fl_flags : 0; __entry->fl_type = fl ? fl->fl_type : 0; __entry->fl_break_time = fl ? fl->fl_break_time : 0; __entry->fl_downgrade_time = fl ? fl->fl_downgrade_time : 0; ), TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_flags=%s fl_type=%s fl_break_time=%lu fl_downgrade_time=%lu", __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->fl_blocker, __entry->fl_owner, show_fl_flags(__entry->fl_flags), show_fl_type(__entry->fl_type), __entry->fl_break_time, __entry->fl_downgrade_time) ); DEFINE_EVENT(filelock_lease, break_lease_noblock, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, break_lease_block, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, break_lease_unblock, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, generic_delete_lease, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, time_out_leases, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl)); TRACE_EVENT(generic_add_lease, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl), TP_STRUCT__entry( __field(unsigned long, i_ino) __field(int, wcount) __field(int, rcount) __field(int, icount) __field(dev_t, s_dev) __field(fl_owner_t, fl_owner) __field(unsigned int, fl_flags) __field(unsigned char, fl_type) ), TP_fast_assign( __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->wcount = atomic_read(&inode->i_writecount); __entry->rcount = atomic_read(&inode->i_readcount); __entry->icount = atomic_read(&inode->i_count); __entry->fl_owner = fl->fl_owner; __entry->fl_flags = fl->fl_flags; __entry->fl_type = fl->fl_type; ), TP_printk("dev=0x%x:0x%x ino=0x%lx wcount=%d rcount=%d icount=%d fl_owner=%p fl_flags=%s fl_type=%s", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->wcount, __entry->rcount, __entry->icount, __entry->fl_owner, show_fl_flags(__entry->fl_flags), show_fl_type(__entry->fl_type)) ); TRACE_EVENT(leases_conflict, TP_PROTO(bool conflict, struct file_lock *lease, struct file_lock *breaker), TP_ARGS(conflict, lease, breaker), TP_STRUCT__entry( __field(void *, lease) __field(void *, breaker) __field(unsigned int, l_fl_flags) __field(unsigned int, b_fl_flags) __field(unsigned char, l_fl_type) __field(unsigned char, b_fl_type) __field(bool, conflict) ), TP_fast_assign( __entry->lease = lease; __entry->l_fl_flags = lease->fl_flags; __entry->l_fl_type = lease->fl_type; __entry->breaker = breaker; __entry->b_fl_flags = breaker->fl_flags; __entry->b_fl_type = breaker->fl_type; __entry->conflict = conflict; ), TP_printk("conflict %d: lease=%p fl_flags=%s fl_type=%s; breaker=%p fl_flags=%s fl_type=%s", __entry->conflict, __entry->lease, show_fl_flags(__entry->l_fl_flags), show_fl_type(__entry->l_fl_type), __entry->breaker, show_fl_flags(__entry->b_fl_flags), show_fl_type(__entry->b_fl_type)) ); #endif /* _TRACE_FILELOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef INT_BLK_MQ_TAG_H #define INT_BLK_MQ_TAG_H /* * Tag address space map. */ struct blk_mq_tags { unsigned int nr_tags; unsigned int nr_reserved_tags; atomic_t active_queues; struct sbitmap_queue *bitmap_tags; struct sbitmap_queue *breserved_tags; struct sbitmap_queue __bitmap_tags; struct sbitmap_queue __breserved_tags; struct request **rqs; struct request **static_rqs; struct list_head page_list; /* * used to clear request reference in rqs[] before freeing one * request pool */ spinlock_t lock; }; extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, unsigned int flags); extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags); extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags); extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags **tags, unsigned int depth, bool can_grow); extern void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv); void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv); static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, struct blk_mq_hw_ctx *hctx) { if (!hctx) return &bt->ws[0]; return sbq_wait_ptr(bt, &hctx->wait_index); } enum { BLK_MQ_NO_TAG = -1U, BLK_MQ_TAG_MIN = 1, BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1, }; extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *); extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) return false; return __blk_mq_tag_busy(hctx); } static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) return; __blk_mq_tag_idle(hctx); } static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, unsigned int tag) { return tag < tags->nr_reserved_tags; } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM tcp #if !defined(_TRACE_TCP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_TCP_H #include <linux/ipv6.h> #include <linux/tcp.h> #include <linux/tracepoint.h> #include <net/ipv6.h> #include <net/tcp.h> #include <linux/sock_diag.h> #define TP_STORE_V4MAPPED(__entry, saddr, daddr) \ do { \ struct in6_addr *pin6; \ \ pin6 = (struct in6_addr *)__entry->saddr_v6; \ ipv6_addr_set_v4mapped(saddr, pin6); \ pin6 = (struct in6_addr *)__entry->daddr_v6; \ ipv6_addr_set_v4mapped(daddr, pin6); \ } while (0) #if IS_ENABLED(CONFIG_IPV6) #define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6) \ do { \ if (sk->sk_family == AF_INET6) { \ struct in6_addr *pin6; \ \ pin6 = (struct in6_addr *)__entry->saddr_v6; \ *pin6 = saddr6; \ pin6 = (struct in6_addr *)__entry->daddr_v6; \ *pin6 = daddr6; \ } else { \ TP_STORE_V4MAPPED(__entry, saddr, daddr); \ } \ } while (0) #else #define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6) \ TP_STORE_V4MAPPED(__entry, saddr, daddr) #endif /* * tcp event with arguments sk and skb * * Note: this class requires a valid sk pointer; while skb pointer could * be NULL. */ DECLARE_EVENT_CLASS(tcp_event_sk_skb, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( __field(const void *, skbaddr) __field(const void *, skaddr) __field(int, state) __field(__u16, sport) __field(__u16, dport) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skbaddr = skb; __entry->skaddr = sk; __entry->state = sk->sk_state; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); ), TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s", __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, show_tcp_state_name(__entry->state)) ); DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); /* * skb of trace_tcp_send_reset is the skb that caused RST. In case of * active reset, skb should be NULL */ DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); /* * tcp event with arguments sk * * Note: this class requires a valid sk pointer. */ DECLARE_EVENT_CLASS(tcp_event_sk, TP_PROTO(struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(const void *, skaddr) __field(__u16, sport) __field(__u16, dport) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) __field(__u64, sock_cookie) ), TP_fast_assign( struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skaddr = sk; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); __entry->sock_cookie = sock_gen_cookie(sk); ), TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c sock_cookie=%llx", __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->sock_cookie) ); DEFINE_EVENT(tcp_event_sk, tcp_receive_reset, TP_PROTO(struct sock *sk), TP_ARGS(sk) ); DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock, TP_PROTO(struct sock *sk), TP_ARGS(sk) ); DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust, TP_PROTO(struct sock *sk), TP_ARGS(sk) ); TRACE_EVENT(tcp_retransmit_synack, TP_PROTO(const struct sock *sk, const struct request_sock *req), TP_ARGS(sk, req), TP_STRUCT__entry( __field(const void *, skaddr) __field(const void *, req) __field(__u16, sport) __field(__u16, dport) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( struct inet_request_sock *ireq = inet_rsk(req); __be32 *p32; __entry->skaddr = sk; __entry->req = req; __entry->sport = ireq->ir_num; __entry->dport = ntohs(ireq->ir_rmt_port); p32 = (__be32 *) __entry->saddr; *p32 = ireq->ir_loc_addr; p32 = (__be32 *) __entry->daddr; *p32 = ireq->ir_rmt_addr; TP_STORE_ADDRS(__entry, ireq->ir_loc_addr, ireq->ir_rmt_addr, ireq->ir_v6_loc_addr, ireq->ir_v6_rmt_addr); ), TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6) ); #include <trace/events/net_probe_common.h> TRACE_EVENT(tcp_probe, TP_PROTO(struct sock *sk, struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( /* sockaddr_in6 is always bigger than sockaddr_in */ __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(__u16, sport) __field(__u16, dport) __field(__u32, mark) __field(__u16, data_len) __field(__u32, snd_nxt) __field(__u32, snd_una) __field(__u32, snd_cwnd) __field(__u32, ssthresh) __field(__u32, snd_wnd) __field(__u32, srtt) __field(__u32, rcv_wnd) __field(__u64, sock_cookie) ), TP_fast_assign( const struct tcphdr *th = (const struct tcphdr *)skb->data; const struct inet_sock *inet = inet_sk(sk); const struct tcp_sock *tp = tcp_sk(sk); memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS(__entry, inet, sk); /* For filtering use */ __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->mark = skb->mark; __entry->data_len = skb->len - __tcp_hdrlen(th); __entry->snd_nxt = tp->snd_nxt; __entry->snd_una = tp->snd_una; __entry->snd_cwnd = tp->snd_cwnd; __entry->snd_wnd = tp->snd_wnd; __entry->rcv_wnd = tp->rcv_wnd; __entry->ssthresh = tcp_current_ssthresh(sk); __entry->srtt = tp->srtt_us >> 3; __entry->sock_cookie = sock_gen_cookie(sk); ), TP_printk("src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx", __entry->saddr, __entry->daddr, __entry->mark, __entry->data_len, __entry->snd_nxt, __entry->snd_una, __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd, __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie) ); #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PGTABLE_H #define _ASM_X86_PGTABLE_H #include <linux/mem_encrypt.h> #include <asm/page.h> #include <asm/pgtable_types.h> /* * Macro to mark a page protection value as UC- */ #define pgprot_noncached(prot) \ ((boot_cpu_data.x86 > 3) \ ? (__pgprot(pgprot_val(prot) | \ cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \ : (prot)) /* * Macros to add or remove encryption attribute */ #define pgprot_encrypted(prot) __pgprot(__sme_set(pgprot_val(prot))) #define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot))) #ifndef __ASSEMBLY__ #include <asm/x86_init.h> #include <asm/fpu/xstate.h> #include <asm/fpu/api.h> #include <asm-generic/pgtable_uffd.h> extern pgd_t early_top_pgt[PTRS_PER_PGD]; bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd); void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm); void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, bool user); void ptdump_walk_pgd_level_checkwx(void); void ptdump_walk_user_pgd_level_checkwx(void); #ifdef CONFIG_DEBUG_WX #define debug_checkwx() ptdump_walk_pgd_level_checkwx() #define debug_checkwx_user() ptdump_walk_user_pgd_level_checkwx() #else #define debug_checkwx() do { } while (0) #define debug_checkwx_user() do { } while (0) #endif /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. */ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __visible; #define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page)) extern spinlock_t pgd_lock; extern struct list_head pgd_list; extern struct mm_struct *pgd_page_get_mm(struct page *page); extern pmdval_t early_pmd_flags; #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else /* !CONFIG_PARAVIRT_XXL */ #define set_pte(ptep, pte) native_set_pte(ptep, pte) #define set_pte_atomic(ptep, pte) \ native_set_pte_atomic(ptep, pte) #define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd) #ifndef __PAGETABLE_P4D_FOLDED #define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd) #define pgd_clear(pgd) (pgtable_l5_enabled() ? native_pgd_clear(pgd) : 0) #endif #ifndef set_p4d # define set_p4d(p4dp, p4d) native_set_p4d(p4dp, p4d) #endif #ifndef __PAGETABLE_PUD_FOLDED #define p4d_clear(p4d) native_p4d_clear(p4d) #endif #ifndef set_pud # define set_pud(pudp, pud) native_set_pud(pudp, pud) #endif #ifndef __PAGETABLE_PUD_FOLDED #define pud_clear(pud) native_pud_clear(pud) #endif #define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep) #define pmd_clear(pmd) native_pmd_clear(pmd) #define pgd_val(x) native_pgd_val(x) #define __pgd(x) native_make_pgd(x) #ifndef __PAGETABLE_P4D_FOLDED #define p4d_val(x) native_p4d_val(x) #define __p4d(x) native_make_p4d(x) #endif #ifndef __PAGETABLE_PUD_FOLDED #define pud_val(x) native_pud_val(x) #define __pud(x) native_make_pud(x) #endif #ifndef __PAGETABLE_PMD_FOLDED #define pmd_val(x) native_pmd_val(x) #define __pmd(x) native_make_pmd(x) #endif #define pte_val(x) native_pte_val(x) #define __pte(x) native_make_pte(x) #define arch_end_context_switch(prev) do {} while(0) #endif /* CONFIG_PARAVIRT_XXL */ /* * The following only work if pte_present() is true. * Undefined behaviour if not.. */ static inline int pte_dirty(pte_t pte) { return pte_flags(pte) & _PAGE_DIRTY; } static inline u32 read_pkru(void) { if (boot_cpu_has(X86_FEATURE_OSPKE)) return rdpkru(); return 0; } static inline void write_pkru(u32 pkru) { struct pkru_state *pk; if (!boot_cpu_has(X86_FEATURE_OSPKE)) return; pk = get_xsave_addr(&current->thread.fpu.state.xsave, XFEATURE_PKRU); /* * The PKRU value in xstate needs to be in sync with the value that is * written to the CPU. The FPU restore on return to userland would * otherwise load the previous value again. */ fpregs_lock(); if (pk) pk->pkru = pkru; __write_pkru(pkru); fpregs_unlock(); } static inline int pte_young(pte_t pte) { return pte_flags(pte) & _PAGE_ACCESSED; } static inline int pmd_dirty(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_DIRTY; } static inline int pmd_young(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_ACCESSED; } static inline int pud_dirty(pud_t pud) { return pud_flags(pud) & _PAGE_DIRTY; } static inline int pud_young(pud_t pud) { return pud_flags(pud) & _PAGE_ACCESSED; } static inline int pte_write(pte_t pte) { return pte_flags(pte) & _PAGE_RW; } static inline int pte_huge(pte_t pte) { return pte_flags(pte) & _PAGE_PSE; } static inline int pte_global(pte_t pte) { return pte_flags(pte) & _PAGE_GLOBAL; } static inline int pte_exec(pte_t pte) { return !(pte_flags(pte) & _PAGE_NX); } static inline int pte_special(pte_t pte) { return pte_flags(pte) & _PAGE_SPECIAL; } /* Entries that were set to PROT_NONE are inverted */ static inline u64 protnone_mask(u64 val); static inline unsigned long pte_pfn(pte_t pte) { phys_addr_t pfn = pte_val(pte); pfn ^= protnone_mask(pfn); return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT; } static inline unsigned long pmd_pfn(pmd_t pmd) { phys_addr_t pfn = pmd_val(pmd); pfn ^= protnone_mask(pfn); return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; } static inline unsigned long pud_pfn(pud_t pud) { phys_addr_t pfn = pud_val(pud); pfn ^= protnone_mask(pfn); return (pfn & pud_pfn_mask(pud)) >> PAGE_SHIFT; } static inline unsigned long p4d_pfn(p4d_t p4d) { return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT; } static inline unsigned long pgd_pfn(pgd_t pgd) { return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT; } #define p4d_leaf p4d_large static inline int p4d_large(p4d_t p4d) { /* No 512 GiB pages yet */ return 0; } #define pte_page(pte) pfn_to_page(pte_pfn(pte)) #define pmd_leaf pmd_large static inline int pmd_large(pmd_t pte) { return pmd_flags(pte) & _PAGE_PSE; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* NOTE: when predicate huge page, consider also pmd_devmap, or use pmd_large */ static inline int pmd_trans_huge(pmd_t pmd) { return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static inline int pud_trans_huge(pud_t pud) { return (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; } #endif #define has_transparent_hugepage has_transparent_hugepage static inline int has_transparent_hugepage(void) { return boot_cpu_has(X86_FEATURE_PSE); } #ifdef CONFIG_ARCH_HAS_PTE_DEVMAP static inline int pmd_devmap(pmd_t pmd) { return !!(pmd_val(pmd) & _PAGE_DEVMAP); } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static inline int pud_devmap(pud_t pud) { return !!(pud_val(pud) & _PAGE_DEVMAP); } #else static inline int pud_devmap(pud_t pud) { return 0; } #endif static inline int pgd_devmap(pgd_t pgd) { return 0; } #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline pte_t pte_set_flags(pte_t pte, pteval_t set) { pteval_t v = native_pte_val(pte); return native_make_pte(v | set); } static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) { pteval_t v = native_pte_val(pte); return native_make_pte(v & ~clear); } #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline int pte_uffd_wp(pte_t pte) { return pte_flags(pte) & _PAGE_UFFD_WP; } static inline pte_t pte_mkuffd_wp(pte_t pte) { return pte_set_flags(pte, _PAGE_UFFD_WP); } static inline pte_t pte_clear_uffd_wp(pte_t pte) { return pte_clear_flags(pte, _PAGE_UFFD_WP); } #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ static inline pte_t pte_mkclean(pte_t pte) { return pte_clear_flags(pte, _PAGE_DIRTY); } static inline pte_t pte_mkold(pte_t pte) { return pte_clear_flags(pte, _PAGE_ACCESSED); } static inline pte_t pte_wrprotect(pte_t pte) { return pte_clear_flags(pte, _PAGE_RW); } static inline pte_t pte_mkexec(pte_t pte) { return pte_clear_flags(pte, _PAGE_NX); } static inline pte_t pte_mkdirty(pte_t pte) { return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); } static inline pte_t pte_mkyoung(pte_t pte) { return pte_set_flags(pte, _PAGE_ACCESSED); } static inline pte_t pte_mkwrite(pte_t pte) { return pte_set_flags(pte, _PAGE_RW); } static inline pte_t pte_mkhuge(pte_t pte) { return pte_set_flags(pte, _PAGE_PSE); } static inline pte_t pte_clrhuge(pte_t pte) { return pte_clear_flags(pte, _PAGE_PSE); } static inline pte_t pte_mkglobal(pte_t pte) { return pte_set_flags(pte, _PAGE_GLOBAL); } static inline pte_t pte_clrglobal(pte_t pte) { return pte_clear_flags(pte, _PAGE_GLOBAL); } static inline pte_t pte_mkspecial(pte_t pte) { return pte_set_flags(pte, _PAGE_SPECIAL); } static inline pte_t pte_mkdevmap(pte_t pte) { return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP); } static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) { pmdval_t v = native_pmd_val(pmd); return native_make_pmd(v | set); } static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) { pmdval_t v = native_pmd_val(pmd); return native_make_pmd(v & ~clear); } #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline int pmd_uffd_wp(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_UFFD_WP; } static inline pmd_t pmd_mkuffd_wp(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_UFFD_WP); } static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_UFFD_WP); } #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ static inline pmd_t pmd_mkold(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_ACCESSED); } static inline pmd_t pmd_mkclean(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_DIRTY); } static inline pmd_t pmd_wrprotect(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_RW); } static inline pmd_t pmd_mkdirty(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); } static inline pmd_t pmd_mkdevmap(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_DEVMAP); } static inline pmd_t pmd_mkhuge(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_PSE); } static inline pmd_t pmd_mkyoung(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_ACCESSED); } static inline pmd_t pmd_mkwrite(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_RW); } static inline pud_t pud_set_flags(pud_t pud, pudval_t set) { pudval_t v = native_pud_val(pud); return native_make_pud(v | set); } static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) { pudval_t v = native_pud_val(pud); return native_make_pud(v & ~clear); } static inline pud_t pud_mkold(pud_t pud) { return pud_clear_flags(pud, _PAGE_ACCESSED); } static inline pud_t pud_mkclean(pud_t pud) { return pud_clear_flags(pud, _PAGE_DIRTY); } static inline pud_t pud_wrprotect(pud_t pud) { return pud_clear_flags(pud, _PAGE_RW); } static inline pud_t pud_mkdirty(pud_t pud) { return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); } static inline pud_t pud_mkdevmap(pud_t pud) { return pud_set_flags(pud, _PAGE_DEVMAP); } static inline pud_t pud_mkhuge(pud_t pud) { return pud_set_flags(pud, _PAGE_PSE); } static inline pud_t pud_mkyoung(pud_t pud) { return pud_set_flags(pud, _PAGE_ACCESSED); } static inline pud_t pud_mkwrite(pud_t pud) { return pud_set_flags(pud, _PAGE_RW); } #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline int pte_soft_dirty(pte_t pte) { return pte_flags(pte) & _PAGE_SOFT_DIRTY; } static inline int pmd_soft_dirty(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_SOFT_DIRTY; } static inline int pud_soft_dirty(pud_t pud) { return pud_flags(pud) & _PAGE_SOFT_DIRTY; } static inline pte_t pte_mksoft_dirty(pte_t pte) { return pte_set_flags(pte, _PAGE_SOFT_DIRTY); } static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } static inline pud_t pud_mksoft_dirty(pud_t pud) { return pud_set_flags(pud, _PAGE_SOFT_DIRTY); } static inline pte_t pte_clear_soft_dirty(pte_t pte) { return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); } static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY); } static inline pud_t pud_clear_soft_dirty(pud_t pud) { return pud_clear_flags(pud, _PAGE_SOFT_DIRTY); } #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. */ static inline pgprotval_t massage_pgprot(pgprot_t pgprot) { pgprotval_t protval = pgprot_val(pgprot); if (protval & _PAGE_PRESENT) protval &= __supported_pte_mask; return protval; } static inline pgprotval_t check_pgprot(pgprot_t pgprot) { pgprotval_t massaged_val = massage_pgprot(pgprot); /* mmdebug.h can not be included here because of dependencies */ #ifdef CONFIG_DEBUG_VM WARN_ONCE(pgprot_val(pgprot) != massaged_val, "attempted to set unsupported pgprot: %016llx " "bits: %016llx supported: %016llx\n", (u64)pgprot_val(pgprot), (u64)pgprot_val(pgprot) ^ massaged_val, (u64)__supported_pte_mask); #endif return massaged_val; } static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; pfn ^= protnone_mask(pgprot_val(pgprot)); pfn &= PTE_PFN_MASK; return __pte(pfn | check_pgprot(pgprot)); } static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) { phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; pfn ^= protnone_mask(pgprot_val(pgprot)); pfn &= PHYSICAL_PMD_PAGE_MASK; return __pmd(pfn | check_pgprot(pgprot)); } static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot) { phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; pfn ^= protnone_mask(pgprot_val(pgprot)); pfn &= PHYSICAL_PUD_PAGE_MASK; return __pud(pfn | check_pgprot(pgprot)); } static inline pmd_t pmd_mkinvalid(pmd_t pmd) { return pfn_pmd(pmd_pfn(pmd), __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); } static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask); static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { pteval_t val = pte_val(pte), oldval = val; /* * Chop off the NX bit (if present), and add the NX portion of * the newprot (if present): */ val &= _PAGE_CHG_MASK; val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK; val = flip_protnone_guard(oldval, val, PTE_PFN_MASK); return __pte(val); } static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) { pmdval_t val = pmd_val(pmd), oldval = val; val &= _HPAGE_CHG_MASK; val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK; val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK); return __pmd(val); } /* * mprotect needs to preserve PAT and encryption bits when updating * vm_page_prot */ #define pgprot_modify pgprot_modify static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) { pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK; pgprotval_t addbits = pgprot_val(newprot) & ~_PAGE_CHG_MASK; return __pgprot(preservebits | addbits); } #define pte_pgprot(x) __pgprot(pte_flags(x)) #define pmd_pgprot(x) __pgprot(pmd_flags(x)) #define pud_pgprot(x) __pgprot(pud_flags(x)) #define p4d_pgprot(x) __pgprot(p4d_flags(x)) #define canon_pgprot(p) __pgprot(massage_pgprot(p)) static inline pgprot_t arch_filter_pgprot(pgprot_t prot) { return canon_pgprot(prot); } static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, enum page_cache_mode pcm, enum page_cache_mode new_pcm) { /* * PAT type is always WB for untracked ranges, so no need to check. */ if (x86_platform.is_untracked_pat_range(paddr, paddr + size)) return 1; /* * Certain new memtypes are not allowed with certain * requested memtype: * - request is uncached, return cannot be write-back * - request is write-combine, return cannot be write-back * - request is write-through, return cannot be write-back * - request is write-through, return cannot be write-combine */ if ((pcm == _PAGE_CACHE_MODE_UC_MINUS && new_pcm == _PAGE_CACHE_MODE_WB) || (pcm == _PAGE_CACHE_MODE_WC && new_pcm == _PAGE_CACHE_MODE_WB) || (pcm == _PAGE_CACHE_MODE_WT && new_pcm == _PAGE_CACHE_MODE_WB) || (pcm == _PAGE_CACHE_MODE_WT && new_pcm == _PAGE_CACHE_MODE_WC)) { return 0; } return 1; } pmd_t *populate_extra_pmd(unsigned long vaddr); pte_t *populate_extra_pte(unsigned long vaddr); #ifdef CONFIG_PAGE_TABLE_ISOLATION pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd); /* * Take a PGD location (pgdp) and a pgd value that needs to be set there. * Populates the user and returns the resulting PGD that must be set in * the kernel copy of the page tables. */ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { if (!static_cpu_has(X86_FEATURE_PTI)) return pgd; return __pti_set_user_pgtbl(pgdp, pgd); } #else /* CONFIG_PAGE_TABLE_ISOLATION */ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { return pgd; } #endif /* CONFIG_PAGE_TABLE_ISOLATION */ #endif /* __ASSEMBLY__ */ #ifdef CONFIG_X86_32 # include <asm/pgtable_32.h> #else # include <asm/pgtable_64.h> #endif #ifndef __ASSEMBLY__ #include <linux/mm_types.h> #include <linux/mmdebug.h> #include <linux/log2.h> #include <asm/fixmap.h> static inline int pte_none(pte_t pte) { return !(pte.pte & ~(_PAGE_KNL_ERRATUM_MASK)); } #define __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t a, pte_t b) { return a.pte == b.pte; } static inline int pte_present(pte_t a) { return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); } #ifdef CONFIG_ARCH_HAS_PTE_DEVMAP static inline int pte_devmap(pte_t a) { return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP; } #endif #define pte_accessible pte_accessible static inline bool pte_accessible(struct mm_struct *mm, pte_t a) { if (pte_flags(a) & _PAGE_PRESENT) return true; if ((pte_flags(a) & _PAGE_PROTNONE) && mm_tlb_flush_pending(mm)) return true; return false; } static inline int pmd_present(pmd_t pmd) { /* * Checking for _PAGE_PSE is needed too because * split_huge_page will temporarily clear the present bit (but * the _PAGE_PSE flag will remain set at all times while the * _PAGE_PRESENT bit is clear). */ return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); } #ifdef CONFIG_NUMA_BALANCING /* * These work without NUMA balancing but the kernel does not care. See the * comment in include/linux/pgtable.h */ static inline int pte_protnone(pte_t pte) { return (pte_flags(pte) & (_PAGE_PROTNONE | _PAGE_PRESENT)) == _PAGE_PROTNONE; } static inline int pmd_protnone(pmd_t pmd) { return (pmd_flags(pmd) & (_PAGE_PROTNONE | _PAGE_PRESENT)) == _PAGE_PROTNONE; } #endif /* CONFIG_NUMA_BALANCING */ static inline int pmd_none(pmd_t pmd) { /* Only check low word on 32-bit platforms, since it might be out of sync with upper half. */ unsigned long val = native_pmd_val(pmd); return (val & ~_PAGE_KNL_ERRATUM_MASK) == 0; } static inline unsigned long pmd_page_vaddr(pmd_t pmd) { return (unsigned long)__va(pmd_val(pmd) & pmd_pfn_mask(pmd)); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ #define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. * * (Currently stuck as a macro because of indirect forward reference * to linux/mm.h:page_to_nid()) */ #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) static inline int pmd_bad(pmd_t pmd) { return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; } static inline unsigned long pages_to_mb(unsigned long npg) { return npg >> (20 - PAGE_SHIFT); } #if CONFIG_PGTABLE_LEVELS > 2 static inline int pud_none(pud_t pud) { return (native_pud_val(pud) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0; } static inline int pud_present(pud_t pud) { return pud_flags(pud) & _PAGE_PRESENT; } static inline unsigned long pud_page_vaddr(pud_t pud) { return (unsigned long)__va(pud_val(pud) & pud_pfn_mask(pud)); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ #define pud_page(pud) pfn_to_page(pud_pfn(pud)) #define pud_leaf pud_large static inline int pud_large(pud_t pud) { return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == (_PAGE_PSE | _PAGE_PRESENT); } static inline int pud_bad(pud_t pud) { return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; } #else #define pud_leaf pud_large static inline int pud_large(pud_t pud) { return 0; } #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 static inline int p4d_none(p4d_t p4d) { return (native_p4d_val(p4d) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0; } static inline int p4d_present(p4d_t p4d) { return p4d_flags(p4d) & _PAGE_PRESENT; } static inline unsigned long p4d_page_vaddr(p4d_t p4d) { return (unsigned long)__va(p4d_val(p4d) & p4d_pfn_mask(p4d)); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ #define p4d_page(p4d) pfn_to_page(p4d_pfn(p4d)) static inline int p4d_bad(p4d_t p4d) { unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER; if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) ignore_flags |= _PAGE_NX; return (p4d_flags(p4d) & ~ignore_flags) != 0; } #endif /* CONFIG_PGTABLE_LEVELS > 3 */ static inline unsigned long p4d_index(unsigned long address) { return (address >> P4D_SHIFT) & (PTRS_PER_P4D - 1); } #if CONFIG_PGTABLE_LEVELS > 4 static inline int pgd_present(pgd_t pgd) { if (!pgtable_l5_enabled()) return 1; return pgd_flags(pgd) & _PAGE_PRESENT; } static inline unsigned long pgd_page_vaddr(pgd_t pgd) { return (unsigned long)__va((unsigned long)pgd_val(pgd) & PTE_PFN_MASK); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ #define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd)) /* to find an entry in a page-table-directory. */ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) { if (!pgtable_l5_enabled()) return (p4d_t *)pgd; return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address); } static inline int pgd_bad(pgd_t pgd) { unsigned long ignore_flags = _PAGE_USER; if (!pgtable_l5_enabled()) return 0; if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) ignore_flags |= _PAGE_NX; return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; } static inline int pgd_none(pgd_t pgd) { if (!pgtable_l5_enabled()) return 0; /* * There is no need to do a workaround for the KNL stray * A/D bit erratum here. PGDs only point to page tables * except on 32-bit non-PAE which is not supported on * KNL. */ return !native_pgd_val(pgd); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* __ASSEMBLY__ */ #define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET) #define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY) #ifndef __ASSEMBLY__ extern int direct_gbpages; void init_mem_mapping(void); void early_alloc_pgt_buf(void); extern void memblock_find_dma_reserve(void); void __init poking_init(void); unsigned long init_memory_mapping(unsigned long start, unsigned long end, pgprot_t prot); #ifdef CONFIG_X86_64 extern pgd_t trampoline_pgd_entry; #endif /* local pte updates need not use xchg for locking */ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) { pte_t res = *ptep; /* Pure native function needs no input for mm, addr */ native_pte_clear(NULL, 0, ptep); return res; } static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp) { pmd_t res = *pmdp; native_pmd_clear(pmdp); return res; } static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) { pud_t res = *pudp; native_pud_clear(pudp); return res; } static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { set_pte(ptep, pte); } static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { set_pmd(pmdp, pmd); } static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { native_set_pud(pudp, pud); } /* * We only update the dirty/accessed state if we set * the dirty bit by hand in the kernel, since the hardware * will do the accessed bit for us, and we don't want to * race with other CPU's that might be updating the dirty * bit at the same time. */ struct vm_area_struct; #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG extern int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH extern int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #define __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = native_ptep_get_and_clear(ptep); return pte; } #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full) { pte_t pte; if (full) { /* * Full address destruction in progress; paravirt does not * care about updates and native needs no locking */ pte = native_local_ptep_get_and_clear(ptep); } else { pte = ptep_get_and_clear(mm, addr, ptep); } return pte; } #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte); } #define flush_tlb_fix_spurious_fault(vma, address) do { } while (0) #define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); extern int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty); #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp); extern int pudp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp); #define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH extern int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #define pmd_write pmd_write static inline int pmd_write(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_RW; } #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { return native_pmdp_get_and_clear(pmdp); } #define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { return native_pudp_get_and_clear(pudp); } #define __HAVE_ARCH_PMDP_SET_WRPROTECT static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); } #define pud_write pud_write static inline int pud_write(pud_t pud) { return pud_flags(pud) & _PAGE_RW; } #ifndef pmdp_establish #define pmdp_establish pmdp_establish static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { if (IS_ENABLED(CONFIG_SMP)) { return xchg(pmdp, pmd); } else { pmd_t old = *pmdp; WRITE_ONCE(*pmdp, pmd); return old; } } #endif /* * Page table pages are page-aligned. The lower half of the top * level is used for userspace and the top half for the kernel. * * Returns true for parts of the PGD that map userspace and * false for the parts that map the kernel. */ static inline bool pgdp_maps_userspace(void *__ptr) { unsigned long ptr = (unsigned long)__ptr; return (((ptr & ~PAGE_MASK) / sizeof(pgd_t)) < PGD_KERNEL_START); } #define pgd_leaf pgd_large static inline int pgd_large(pgd_t pgd) { return 0; } #ifdef CONFIG_PAGE_TABLE_ISOLATION /* * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and * the user one is in the last 4k. To switch between them, you * just need to flip the 12th bit in their addresses. */ #define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT /* * This generates better code than the inline assembly in * __set_bit(). */ static inline void *ptr_set_bit(void *ptr, int bit) { unsigned long __ptr = (unsigned long)ptr; __ptr |= BIT(bit); return (void *)__ptr; } static inline void *ptr_clear_bit(void *ptr, int bit) { unsigned long __ptr = (unsigned long)ptr; __ptr &= ~BIT(bit); return (void *)__ptr; } static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp) { return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); } static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp) { return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); } static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp) { return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); } static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) { return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); } #endif /* CONFIG_PAGE_TABLE_ISOLATION */ /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * * dst - pointer to pgd range anwhere on a pgd page * src - "" * count - the number of pgds to copy. * * dst and src can be on the same page, but the range must not overlap, * and must not cross a page boundary. */ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { memcpy(dst, src, count * sizeof(pgd_t)); #ifdef CONFIG_PAGE_TABLE_ISOLATION if (!static_cpu_has(X86_FEATURE_PTI)) return; /* Clone the user space pgd as well */ memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src), count * sizeof(pgd_t)); #endif } #define PTE_SHIFT ilog2(PTRS_PER_PTE) static inline int page_level_shift(enum pg_level level) { return (PAGE_SHIFT - PTE_SHIFT) + level * PTE_SHIFT; } static inline unsigned long page_level_size(enum pg_level level) { return 1UL << page_level_shift(level); } static inline unsigned long page_level_mask(enum pg_level level) { return ~(page_level_size(level) - 1); } /* * The x86 doesn't have any external MMU info: the kernel page * tables contain all the necessary information. */ static inline void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { } static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { } static inline void update_mmu_cache_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { } #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); } static inline int pte_swp_soft_dirty(pte_t pte) { return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; } static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); } #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_SWP_SOFT_DIRTY); } static inline int pmd_swp_soft_dirty(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_SWP_SOFT_DIRTY; } static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_SWP_SOFT_DIRTY); } #endif #endif #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline pte_t pte_swp_mkuffd_wp(pte_t pte) { return pte_set_flags(pte, _PAGE_SWP_UFFD_WP); } static inline int pte_swp_uffd_wp(pte_t pte) { return pte_flags(pte) & _PAGE_SWP_UFFD_WP; } static inline pte_t pte_swp_clear_uffd_wp(pte_t pte) { return pte_clear_flags(pte, _PAGE_SWP_UFFD_WP); } static inline pmd_t pmd_swp_mkuffd_wp(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_SWP_UFFD_WP); } static inline int pmd_swp_uffd_wp(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_SWP_UFFD_WP; } static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_SWP_UFFD_WP); } #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ #define PKRU_AD_BIT 0x1u #define PKRU_WD_BIT 0x2u #define PKRU_BITS_PER_PKEY 2 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS extern u32 init_pkru_value; #else #define init_pkru_value 0 #endif static inline bool __pkru_allows_read(u32 pkru, u16 pkey) { int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits)); } static inline bool __pkru_allows_write(u32 pkru, u16 pkey) { int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; /* * Access-disable disables writes too so we need to check * both bits here. */ return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits)); } static inline u16 pte_flags_pkey(unsigned long pte_flags) { #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS /* ifdef to avoid doing 59-bit shift on 32-bit values */ return (pte_flags & _PAGE_PKEY_MASK) >> _PAGE_BIT_PKEY_BIT0; #else return 0; #endif } static inline bool __pkru_allows_pkey(u16 pkey, bool write) { u32 pkru = read_pkru(); if (!__pkru_allows_read(pkru, pkey)) return false; if (write && !__pkru_allows_write(pkru, pkey)) return false; return true; } /* * 'pteval' can come from a PTE, PMD or PUD. We only check * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the * same value on all 3 types. */ static inline bool __pte_access_permitted(unsigned long pteval, bool write) { unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; if (write) need_pte_bits |= _PAGE_RW; if ((pteval & need_pte_bits) != need_pte_bits) return 0; return __pkru_allows_pkey(pte_flags_pkey(pteval), write); } #define pte_access_permitted pte_access_permitted static inline bool pte_access_permitted(pte_t pte, bool write) { return __pte_access_permitted(pte_val(pte), write); } #define pmd_access_permitted pmd_access_permitted static inline bool pmd_access_permitted(pmd_t pmd, bool write) { return __pte_access_permitted(pmd_val(pmd), write); } #define pud_access_permitted pud_access_permitted static inline bool pud_access_permitted(pud_t pud, bool write) { return __pte_access_permitted(pud_val(pud), write); } #define __HAVE_ARCH_PFN_MODIFY_ALLOWED 1 extern bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot); static inline bool arch_has_pfn_modify_check(void) { return boot_cpu_has_bug(X86_BUG_L1TF); } #define arch_faults_on_old_pte arch_faults_on_old_pte static inline bool arch_faults_on_old_pte(void) { return false; } #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __CFG802154_RDEV_OPS #define __CFG802154_RDEV_OPS #include <net/cfg802154.h> #include "core.h" #include "trace.h" static inline struct net_device * rdev_add_virtual_intf_deprecated(struct cfg802154_registered_device *rdev, const char *name, unsigned char name_assign_type, int type) { return rdev->ops->add_virtual_intf_deprecated(&rdev->wpan_phy, name, name_assign_type, type); } static inline void rdev_del_virtual_intf_deprecated(struct cfg802154_registered_device *rdev, struct net_device *dev) { rdev->ops->del_virtual_intf_deprecated(&rdev->wpan_phy, dev); } static inline int rdev_suspend(struct cfg802154_registered_device *rdev) { int ret; trace_802154_rdev_suspend(&rdev->wpan_phy); ret = rdev->ops->suspend(&rdev->wpan_phy); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_resume(struct cfg802154_registered_device *rdev) { int ret; trace_802154_rdev_resume(&rdev->wpan_phy); ret = rdev->ops->resume(&rdev->wpan_phy); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_add_virtual_intf(struct cfg802154_registered_device *rdev, char *name, unsigned char name_assign_type, enum nl802154_iftype type, __le64 extended_addr) { int ret; trace_802154_rdev_add_virtual_intf(&rdev->wpan_phy, name, type, extended_addr); ret = rdev->ops->add_virtual_intf(&rdev->wpan_phy, name, name_assign_type, type, extended_addr); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_del_virtual_intf(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev) { int ret; trace_802154_rdev_del_virtual_intf(&rdev->wpan_phy, wpan_dev); ret = rdev->ops->del_virtual_intf(&rdev->wpan_phy, wpan_dev); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_set_channel(struct cfg802154_registered_device *rdev, u8 page, u8 channel) { int ret; trace_802154_rdev_set_channel(&rdev->wpan_phy, page, channel); ret = rdev->ops->set_channel(&rdev->wpan_phy, page, channel); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_set_cca_mode(struct cfg802154_registered_device *rdev, const struct wpan_phy_cca *cca) { int ret; trace_802154_rdev_set_cca_mode(&rdev->wpan_phy, cca); ret = rdev->ops->set_cca_mode(&rdev->wpan_phy, cca); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_set_cca_ed_level(struct cfg802154_registered_device *rdev, s32 ed_level) { int ret; trace_802154_rdev_set_cca_ed_level(&rdev->wpan_phy, ed_level); ret = rdev->ops->set_cca_ed_level(&rdev->wpan_phy, ed_level); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_set_tx_power(struct cfg802154_registered_device *rdev, s32 power) { int ret; trace_802154_rdev_set_tx_power(&rdev->wpan_phy, power); ret = rdev->ops->set_tx_power(&rdev->wpan_phy, power); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_set_pan_id(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, __le16 pan_id) { int ret; trace_802154_rdev_set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id); ret = rdev->ops->set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_set_short_addr(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, __le16 short_addr) { int ret; trace_802154_rdev_set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr); ret = rdev->ops->set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_set_backoff_exponent(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, u8 min_be, u8 max_be) { int ret; trace_802154_rdev_set_backoff_exponent(&rdev->wpan_phy, wpan_dev, min_be, max_be); ret = rdev->ops->set_backoff_exponent(&rdev->wpan_phy, wpan_dev, min_be, max_be); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_set_max_csma_backoffs(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, u8 max_csma_backoffs) { int ret; trace_802154_rdev_set_csma_backoffs(&rdev->wpan_phy, wpan_dev, max_csma_backoffs); ret = rdev->ops->set_max_csma_backoffs(&rdev->wpan_phy, wpan_dev, max_csma_backoffs); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_set_max_frame_retries(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, s8 max_frame_retries) { int ret; trace_802154_rdev_set_max_frame_retries(&rdev->wpan_phy, wpan_dev, max_frame_retries); ret = rdev->ops->set_max_frame_retries(&rdev->wpan_phy, wpan_dev, max_frame_retries); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_set_lbt_mode(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, bool mode) { int ret; trace_802154_rdev_set_lbt_mode(&rdev->wpan_phy, wpan_dev, mode); ret = rdev->ops->set_lbt_mode(&rdev->wpan_phy, wpan_dev, mode); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } static inline int rdev_set_ackreq_default(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, bool ackreq) { int ret; trace_802154_rdev_set_ackreq_default(&rdev->wpan_phy, wpan_dev, ackreq); ret = rdev->ops->set_ackreq_default(&rdev->wpan_phy, wpan_dev, ackreq); trace_802154_rdev_return_int(&rdev->wpan_phy, ret); return ret; } #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL /* TODO this is already a nl802154, so move into ieee802154 */ static inline void rdev_get_llsec_table(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, struct ieee802154_llsec_table **table) { rdev->ops->get_llsec_table(&rdev->wpan_phy, wpan_dev, table); } static inline void rdev_lock_llsec_table(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev) { rdev->ops->lock_llsec_table(&rdev->wpan_phy, wpan_dev); } static inline void rdev_unlock_llsec_table(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev) { rdev->ops->unlock_llsec_table(&rdev->wpan_phy, wpan_dev); } static inline int rdev_get_llsec_params(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, struct ieee802154_llsec_params *params) { return rdev->ops->get_llsec_params(&rdev->wpan_phy, wpan_dev, params); } static inline int rdev_set_llsec_params(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_params *params, u32 changed) { return rdev->ops->set_llsec_params(&rdev->wpan_phy, wpan_dev, params, changed); } static inline int rdev_add_llsec_key(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_key_id *id, const struct ieee802154_llsec_key *key) { return rdev->ops->add_llsec_key(&rdev->wpan_phy, wpan_dev, id, key); } static inline int rdev_del_llsec_key(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_key_id *id) { return rdev->ops->del_llsec_key(&rdev->wpan_phy, wpan_dev, id); } static inline int rdev_add_seclevel(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_seclevel *sl) { return rdev->ops->add_seclevel(&rdev->wpan_phy, wpan_dev, sl); } static inline int rdev_del_seclevel(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_seclevel *sl) { return rdev->ops->del_seclevel(&rdev->wpan_phy, wpan_dev, sl); } static inline int rdev_add_device(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_device *dev_desc) { return rdev->ops->add_device(&rdev->wpan_phy, wpan_dev, dev_desc); } static inline int rdev_del_device(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, __le64 extended_addr) { return rdev->ops->del_device(&rdev->wpan_phy, wpan_dev, extended_addr); } static inline int rdev_add_devkey(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, __le64 extended_addr, const struct ieee802154_llsec_device_key *devkey) { return rdev->ops->add_devkey(&rdev->wpan_phy, wpan_dev, extended_addr, devkey); } static inline int rdev_del_devkey(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, __le64 extended_addr, const struct ieee802154_llsec_device_key *devkey) { return rdev->ops->del_devkey(&rdev->wpan_phy, wpan_dev, extended_addr, devkey); } #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ #endif /* __CFG802154_RDEV_OPS */
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 // SPDX-License-Identifier: GPL-2.0 /* * Implementation of the multi-level security (MLS) policy. * * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ /* * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com> * * Support for enhanced MLS infrastructure. * * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc. */ /* * Updated: Hewlett-Packard <paul@paul-moore.com> * * Added support to import/export the MLS label from NetLabel * * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/errno.h> #include <net/netlabel.h> #include "sidtab.h" #include "mls.h" #include "policydb.h" #include "services.h" /* * Return the length in bytes for the MLS fields of the * security context string representation of `context'. */ int mls_compute_context_len(struct policydb *p, struct context *context) { int i, l, len, head, prev; char *nm; struct ebitmap *e; struct ebitmap_node *node; if (!p->mls_enabled) return 0; len = 1; /* for the beginning ":" */ for (l = 0; l < 2; l++) { int index_sens = context->range.level[l].sens; len += strlen(sym_name(p, SYM_LEVELS, index_sens - 1)); /* categories */ head = -2; prev = -2; e = &context->range.level[l].cat; ebitmap_for_each_positive_bit(e, node, i) { if (i - prev > 1) { /* one or more negative bits are skipped */ if (head != prev) { nm = sym_name(p, SYM_CATS, prev); len += strlen(nm) + 1; } nm = sym_name(p, SYM_CATS, i); len += strlen(nm) + 1; head = i; } prev = i; } if (prev != head) { nm = sym_name(p, SYM_CATS, prev); len += strlen(nm) + 1; } if (l == 0) { if (mls_level_eq(&context->range.level[0], &context->range.level[1])) break; else len++; } } return len; } /* * Write the security context string representation of * the MLS fields of `context' into the string `*scontext'. * Update `*scontext' to point to the end of the MLS fields. */ void mls_sid_to_context(struct policydb *p, struct context *context, char **scontext) { char *scontextp, *nm; int i, l, head, prev; struct ebitmap *e; struct ebitmap_node *node; if (!p->mls_enabled) return; scontextp = *scontext; *scontextp = ':'; scontextp++; for (l = 0; l < 2; l++) { strcpy(scontextp, sym_name(p, SYM_LEVELS, context->range.level[l].sens - 1)); scontextp += strlen(scontextp); /* categories */ head = -2; prev = -2; e = &context->range.level[l].cat; ebitmap_for_each_positive_bit(e, node, i) { if (i - prev > 1) { /* one or more negative bits are skipped */ if (prev != head) { if (prev - head > 1) *scontextp++ = '.'; else *scontextp++ = ','; nm = sym_name(p, SYM_CATS, prev); strcpy(scontextp, nm); scontextp += strlen(nm); } if (prev < 0) *scontextp++ = ':'; else *scontextp++ = ','; nm = sym_name(p, SYM_CATS, i); strcpy(scontextp, nm); scontextp += strlen(nm); head = i; } prev = i; } if (prev != head) { if (prev - head > 1) *scontextp++ = '.'; else *scontextp++ = ','; nm = sym_name(p, SYM_CATS, prev); strcpy(scontextp, nm); scontextp += strlen(nm); } if (l == 0) { if (mls_level_eq(&context->range.level[0], &context->range.level[1])) break; else *scontextp++ = '-'; } } *scontext = scontextp; return; } int mls_level_isvalid(struct policydb *p, struct mls_level *l) { struct level_datum *levdatum; if (!l->sens || l->sens > p->p_levels.nprim) return 0; levdatum = symtab_search(&p->p_levels, sym_name(p, SYM_LEVELS, l->sens - 1)); if (!levdatum) return 0; /* * Return 1 iff all the bits set in l->cat are also be set in * levdatum->level->cat and no bit in l->cat is larger than * p->p_cats.nprim. */ return ebitmap_contains(&levdatum->level->cat, &l->cat, p->p_cats.nprim); } int mls_range_isvalid(struct policydb *p, struct mls_range *r) { return (mls_level_isvalid(p, &r->level[0]) && mls_level_isvalid(p, &r->level[1]) && mls_level_dom(&r->level[1], &r->level[0])); } /* * Return 1 if the MLS fields in the security context * structure `c' are valid. Return 0 otherwise. */ int mls_context_isvalid(struct policydb *p, struct context *c) { struct user_datum *usrdatum; if (!p->mls_enabled) return 1; if (!mls_range_isvalid(p, &c->range)) return 0; if (c->role == OBJECT_R_VAL) return 1; /* * User must be authorized for the MLS range. */ if (!c->user || c->user > p->p_users.nprim) return 0; usrdatum = p->user_val_to_struct[c->user - 1]; if (!mls_range_contains(usrdatum->range, c->range)) return 0; /* user may not be associated with range */ return 1; } /* * Set the MLS fields in the security context structure * `context' based on the string representation in * the string `scontext'. * * This function modifies the string in place, inserting * NULL characters to terminate the MLS fields. * * If a def_sid is provided and no MLS field is present, * copy the MLS field of the associated default context. * Used for upgraded to MLS systems where objects may lack * MLS fields. * * Policy read-lock must be held for sidtab lookup. * */ int mls_context_to_sid(struct policydb *pol, char oldc, char *scontext, struct context *context, struct sidtab *s, u32 def_sid) { char *sensitivity, *cur_cat, *next_cat, *rngptr; struct level_datum *levdatum; struct cat_datum *catdatum, *rngdatum; int l, rc, i; char *rangep[2]; if (!pol->mls_enabled) { /* * With no MLS, only return -EINVAL if there is a MLS field * and it did not come from an xattr. */ if (oldc && def_sid == SECSID_NULL) return -EINVAL; return 0; } /* * No MLS component to the security context, try and map to * default if provided. */ if (!oldc) { struct context *defcon; if (def_sid == SECSID_NULL) return -EINVAL; defcon = sidtab_search(s, def_sid); if (!defcon) return -EINVAL; return mls_context_cpy(context, defcon); } /* * If we're dealing with a range, figure out where the two parts * of the range begin. */ rangep[0] = scontext; rangep[1] = strchr(scontext, '-'); if (rangep[1]) { rangep[1][0] = '\0'; rangep[1]++; } /* For each part of the range: */ for (l = 0; l < 2; l++) { /* Split sensitivity and category set. */ sensitivity = rangep[l]; if (sensitivity == NULL) break; next_cat = strchr(sensitivity, ':'); if (next_cat) *(next_cat++) = '\0'; /* Parse sensitivity. */ levdatum = symtab_search(&pol->p_levels, sensitivity); if (!levdatum) return -EINVAL; context->range.level[l].sens = levdatum->level->sens; /* Extract category set. */ while (next_cat != NULL) { cur_cat = next_cat; next_cat = strchr(next_cat, ','); if (next_cat != NULL) *(next_cat++) = '\0'; /* Separate into range if exists */ rngptr = strchr(cur_cat, '.'); if (rngptr != NULL) { /* Remove '.' */ *rngptr++ = '\0'; } catdatum = symtab_search(&pol->p_cats, cur_cat); if (!catdatum) return -EINVAL; rc = ebitmap_set_bit(&context->range.level[l].cat, catdatum->value - 1, 1); if (rc) return rc; /* If range, set all categories in range */ if (rngptr == NULL) continue; rngdatum = symtab_search(&pol->p_cats, rngptr); if (!rngdatum) return -EINVAL; if (catdatum->value >= rngdatum->value) return -EINVAL; for (i = catdatum->value; i < rngdatum->value; i++) { rc = ebitmap_set_bit(&context->range.level[l].cat, i, 1); if (rc) return rc; } } } /* If we didn't see a '-', the range start is also the range end. */ if (rangep[1] == NULL) { context->range.level[1].sens = context->range.level[0].sens; rc = ebitmap_cpy(&context->range.level[1].cat, &context->range.level[0].cat); if (rc) return rc; } return 0; } /* * Set the MLS fields in the security context structure * `context' based on the string representation in * the string `str'. This function will allocate temporary memory with the * given constraints of gfp_mask. */ int mls_from_string(struct policydb *p, char *str, struct context *context, gfp_t gfp_mask) { char *tmpstr; int rc; if (!p->mls_enabled) return -EINVAL; tmpstr = kstrdup(str, gfp_mask); if (!tmpstr) { rc = -ENOMEM; } else { rc = mls_context_to_sid(p, ':', tmpstr, context, NULL, SECSID_NULL); kfree(tmpstr); } return rc; } /* * Copies the MLS range `range' into `context'. */ int mls_range_set(struct context *context, struct mls_range *range) { int l, rc = 0; /* Copy the MLS range into the context */ for (l = 0; l < 2; l++) { context->range.level[l].sens = range->level[l].sens; rc = ebitmap_cpy(&context->range.level[l].cat, &range->level[l].cat); if (rc) break; } return rc; } int mls_setup_user_range(struct policydb *p, struct context *fromcon, struct user_datum *user, struct context *usercon) { if (p->mls_enabled) { struct mls_level *fromcon_sen = &(fromcon->range.level[0]); struct mls_level *fromcon_clr = &(fromcon->range.level[1]); struct mls_level *user_low = &(user->range.level[0]); struct mls_level *user_clr = &(user->range.level[1]); struct mls_level *user_def = &(user->dfltlevel); struct mls_level *usercon_sen = &(usercon->range.level[0]); struct mls_level *usercon_clr = &(usercon->range.level[1]); /* Honor the user's default level if we can */ if (mls_level_between(user_def, fromcon_sen, fromcon_clr)) *usercon_sen = *user_def; else if (mls_level_between(fromcon_sen, user_def, user_clr)) *usercon_sen = *fromcon_sen; else if (mls_level_between(fromcon_clr, user_low, user_def)) *usercon_sen = *user_low; else return -EINVAL; /* Lower the clearance of available contexts if the clearance of "fromcon" is lower than that of the user's default clearance (but only if the "fromcon" clearance dominates the user's computed sensitivity level) */ if (mls_level_dom(user_clr, fromcon_clr)) *usercon_clr = *fromcon_clr; else if (mls_level_dom(fromcon_clr, user_clr)) *usercon_clr = *user_clr; else return -EINVAL; } return 0; } /* * Convert the MLS fields in the security context * structure `oldc' from the values specified in the * policy `oldp' to the values specified in the policy `newp', * storing the resulting context in `newc'. */ int mls_convert_context(struct policydb *oldp, struct policydb *newp, struct context *oldc, struct context *newc) { struct level_datum *levdatum; struct cat_datum *catdatum; struct ebitmap_node *node; int l, i; if (!oldp->mls_enabled || !newp->mls_enabled) return 0; for (l = 0; l < 2; l++) { char *name = sym_name(oldp, SYM_LEVELS, oldc->range.level[l].sens - 1); levdatum = symtab_search(&newp->p_levels, name); if (!levdatum) return -EINVAL; newc->range.level[l].sens = levdatum->level->sens; ebitmap_for_each_positive_bit(&oldc->range.level[l].cat, node, i) { int rc; catdatum = symtab_search(&newp->p_cats, sym_name(oldp, SYM_CATS, i)); if (!catdatum) return -EINVAL; rc = ebitmap_set_bit(&newc->range.level[l].cat, catdatum->value - 1, 1); if (rc) return rc; } } return 0; } int mls_compute_sid(struct policydb *p, struct context *scontext, struct context *tcontext, u16 tclass, u32 specified, struct context *newcontext, bool sock) { struct range_trans rtr; struct mls_range *r; struct class_datum *cladatum; int default_range = 0; if (!p->mls_enabled) return 0; switch (specified) { case AVTAB_TRANSITION: /* Look for a range transition rule. */ rtr.source_type = scontext->type; rtr.target_type = tcontext->type; rtr.target_class = tclass; r = policydb_rangetr_search(p, &rtr); if (r) return mls_range_set(newcontext, r); if (tclass && tclass <= p->p_classes.nprim) { cladatum = p->class_val_to_struct[tclass - 1]; if (cladatum) default_range = cladatum->default_range; } switch (default_range) { case DEFAULT_SOURCE_LOW: return mls_context_cpy_low(newcontext, scontext); case DEFAULT_SOURCE_HIGH: return mls_context_cpy_high(newcontext, scontext); case DEFAULT_SOURCE_LOW_HIGH: return mls_context_cpy(newcontext, scontext); case DEFAULT_TARGET_LOW: return mls_context_cpy_low(newcontext, tcontext); case DEFAULT_TARGET_HIGH: return mls_context_cpy_high(newcontext, tcontext); case DEFAULT_TARGET_LOW_HIGH: return mls_context_cpy(newcontext, tcontext); case DEFAULT_GLBLUB: return mls_context_glblub(newcontext, scontext, tcontext); } fallthrough; case AVTAB_CHANGE: if ((tclass == p->process_class) || sock) /* Use the process MLS attributes. */ return mls_context_cpy(newcontext, scontext); else /* Use the process effective MLS attributes. */ return mls_context_cpy_low(newcontext, scontext); case AVTAB_MEMBER: /* Use the process effective MLS attributes. */ return mls_context_cpy_low(newcontext, scontext); } return -EINVAL; } #ifdef CONFIG_NETLABEL /** * mls_export_netlbl_lvl - Export the MLS sensitivity levels to NetLabel * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Given the security context copy the low MLS sensitivity level into the * NetLabel MLS sensitivity level field. * */ void mls_export_netlbl_lvl(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { if (!p->mls_enabled) return; secattr->attr.mls.lvl = context->range.level[0].sens - 1; secattr->flags |= NETLBL_SECATTR_MLS_LVL; } /** * mls_import_netlbl_lvl - Import the NetLabel MLS sensitivity levels * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Given the security context and the NetLabel security attributes, copy the * NetLabel MLS sensitivity level into the context. * */ void mls_import_netlbl_lvl(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { if (!p->mls_enabled) return; context->range.level[0].sens = secattr->attr.mls.lvl + 1; context->range.level[1].sens = context->range.level[0].sens; } /** * mls_export_netlbl_cat - Export the MLS categories to NetLabel * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Given the security context copy the low MLS categories into the NetLabel * MLS category field. Returns zero on success, negative values on failure. * */ int mls_export_netlbl_cat(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { int rc; if (!p->mls_enabled) return 0; rc = ebitmap_netlbl_export(&context->range.level[0].cat, &secattr->attr.mls.cat); if (rc == 0 && secattr->attr.mls.cat != NULL) secattr->flags |= NETLBL_SECATTR_MLS_CAT; return rc; } /** * mls_import_netlbl_cat - Import the MLS categories from NetLabel * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Copy the NetLabel security attributes into the SELinux context; since the * NetLabel security attribute only contains a single MLS category use it for * both the low and high categories of the context. Returns zero on success, * negative values on failure. * */ int mls_import_netlbl_cat(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { int rc; if (!p->mls_enabled) return 0; rc = ebitmap_netlbl_import(&context->range.level[0].cat, secattr->attr.mls.cat); if (rc) goto import_netlbl_cat_failure; memcpy(&context->range.level[1].cat, &context->range.level[0].cat, sizeof(context->range.level[0].cat)); return 0; import_netlbl_cat_failure: ebitmap_destroy(&context->range.level[0].cat); return rc; } #endif /* CONFIG_NETLABEL */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * ALSA sequencer Timer * Copyright (c) 1998-1999 by Frank van de Pol <fvdpol@coil.demon.nl> */ #ifndef __SND_SEQ_TIMER_H #define __SND_SEQ_TIMER_H #include <sound/timer.h> #include <sound/seq_kernel.h> struct snd_seq_timer_tick { snd_seq_tick_time_t cur_tick; /* current tick */ unsigned long resolution; /* time per tick in nsec */ unsigned long fraction; /* current time per tick in nsec */ }; struct snd_seq_timer { /* ... tempo / offset / running state */ unsigned int running:1, /* running state of queue */ initialized:1; /* timer is initialized */ unsigned int tempo; /* current tempo, us/tick */ int ppq; /* time resolution, ticks/quarter */ snd_seq_real_time_t cur_time; /* current time */ struct snd_seq_timer_tick tick; /* current tick */ int tick_updated; int type; /* timer type */ struct snd_timer_id alsa_id; /* ALSA's timer ID */ struct snd_timer_instance *timeri; /* timer instance */ unsigned int ticks; unsigned long preferred_resolution; /* timer resolution, ticks/sec */ unsigned int skew; unsigned int skew_base; struct timespec64 last_update; /* time of last clock update, used for interpolation */ spinlock_t lock; }; /* create new timer (constructor) */ struct snd_seq_timer *snd_seq_timer_new(void); /* delete timer (destructor) */ void snd_seq_timer_delete(struct snd_seq_timer **tmr); /* */ static inline void snd_seq_timer_update_tick(struct snd_seq_timer_tick *tick, unsigned long resolution) { if (tick->resolution > 0) { tick->fraction += resolution; tick->cur_tick += (unsigned int)(tick->fraction / tick->resolution); tick->fraction %= tick->resolution; } } /* compare timestamp between events */ /* return 1 if a >= b; otherwise return 0 */ static inline int snd_seq_compare_tick_time(snd_seq_tick_time_t *a, snd_seq_tick_time_t *b) { /* compare ticks */ return (*a >= *b); } static inline int snd_seq_compare_real_time(snd_seq_real_time_t *a, snd_seq_real_time_t *b) { /* compare real time */ if (a->tv_sec > b->tv_sec) return 1; if ((a->tv_sec == b->tv_sec) && (a->tv_nsec >= b->tv_nsec)) return 1; return 0; } static inline void snd_seq_sanity_real_time(snd_seq_real_time_t *tm) { while (tm->tv_nsec >= 1000000000) { /* roll-over */ tm->tv_nsec -= 1000000000; tm->tv_sec++; } } /* increment timestamp */ static inline void snd_seq_inc_real_time(snd_seq_real_time_t *tm, snd_seq_real_time_t *inc) { tm->tv_sec += inc->tv_sec; tm->tv_nsec += inc->tv_nsec; snd_seq_sanity_real_time(tm); } static inline void snd_seq_inc_time_nsec(snd_seq_real_time_t *tm, unsigned long nsec) { tm->tv_nsec += nsec; snd_seq_sanity_real_time(tm); } /* called by timer isr */ struct snd_seq_queue; int snd_seq_timer_open(struct snd_seq_queue *q); int snd_seq_timer_close(struct snd_seq_queue *q); int snd_seq_timer_midi_open(struct snd_seq_queue *q); int snd_seq_timer_midi_close(struct snd_seq_queue *q); void snd_seq_timer_defaults(struct snd_seq_timer *tmr); void snd_seq_timer_reset(struct snd_seq_timer *tmr); int snd_seq_timer_stop(struct snd_seq_timer *tmr); int snd_seq_timer_start(struct snd_seq_timer *tmr); int snd_seq_timer_continue(struct snd_seq_timer *tmr); int snd_seq_timer_set_tempo(struct snd_seq_timer *tmr, int tempo); int snd_seq_timer_set_tempo_ppq(struct snd_seq_timer *tmr, int tempo, int ppq); int snd_seq_timer_set_position_tick(struct snd_seq_timer *tmr, snd_seq_tick_time_t position); int snd_seq_timer_set_position_time(struct snd_seq_timer *tmr, snd_seq_real_time_t position); int snd_seq_timer_set_skew(struct snd_seq_timer *tmr, unsigned int skew, unsigned int base); snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr, bool adjust_ktime); snd_seq_tick_time_t snd_seq_timer_get_cur_tick(struct snd_seq_timer *tmr); extern int seq_default_timer_class; extern int seq_default_timer_sclass; extern int seq_default_timer_card; extern int seq_default_timer_device; extern int seq_default_timer_subdevice; extern int seq_default_timer_resolution; #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 /* SPDX-License-Identifier: GPL-2.0-only */ /* * include/linux/idr.h * * 2002-10-18 written by Jim Houston jim.houston@ccur.com * Copyright (C) 2002 by Concurrent Computer Corporation * * Small id to pointer translation service avoiding fixed sized * tables. */ #ifndef __IDR_H__ #define __IDR_H__ #include <linux/radix-tree.h> #include <linux/gfp.h> #include <linux/percpu.h> struct idr { struct radix_tree_root idr_rt; unsigned int idr_base; unsigned int idr_next; }; /* * The IDR API does not expose the tagging functionality of the radix tree * to users. Use tag 0 to track whether a node has free space below it. */ #define IDR_FREE 0 /* Set the IDR flag and the IDR_FREE tag */ #define IDR_RT_MARKER (ROOT_IS_IDR | (__force gfp_t) \ (1 << (ROOT_TAG_SHIFT + IDR_FREE))) #define IDR_INIT_BASE(name, base) { \ .idr_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER), \ .idr_base = (base), \ .idr_next = 0, \ } /** * IDR_INIT() - Initialise an IDR. * @name: Name of IDR. * * A freshly-initialised IDR contains no IDs. */ #define IDR_INIT(name) IDR_INIT_BASE(name, 0) /** * DEFINE_IDR() - Define a statically-allocated IDR. * @name: Name of IDR. * * An IDR defined using this macro is ready for use with no additional * initialisation required. It contains no IDs. */ #define DEFINE_IDR(name) struct idr name = IDR_INIT(name) /** * idr_get_cursor - Return the current position of the cyclic allocator * @idr: idr handle * * The value returned is the value that will be next returned from * idr_alloc_cyclic() if it is free (otherwise the search will start from * this position). */ static inline unsigned int idr_get_cursor(const struct idr *idr) { return READ_ONCE(idr->idr_next); } /** * idr_set_cursor - Set the current position of the cyclic allocator * @idr: idr handle * @val: new position * * The next call to idr_alloc_cyclic() will return @val if it is free * (otherwise the search will start from this position). */ static inline void idr_set_cursor(struct idr *idr, unsigned int val) { WRITE_ONCE(idr->idr_next, val); } /** * DOC: idr sync * idr synchronization (stolen from radix-tree.h) * * idr_find() is able to be called locklessly, using RCU. The caller must * ensure calls to this function are made within rcu_read_lock() regions. * Other readers (lock-free or otherwise) and modifications may be running * concurrently. * * It is still required that the caller manage the synchronization and * lifetimes of the items. So if RCU lock-free lookups are used, typically * this would mean that the items have their own locks, or are amenable to * lock-free access; and that the items are freed by RCU (or only freed after * having been deleted from the idr tree *and* a synchronize_rcu() grace * period). */ #define idr_lock(idr) xa_lock(&(idr)->idr_rt) #define idr_unlock(idr) xa_unlock(&(idr)->idr_rt) #define idr_lock_bh(idr) xa_lock_bh(&(idr)->idr_rt) #define idr_unlock_bh(idr) xa_unlock_bh(&(idr)->idr_rt) #define idr_lock_irq(idr) xa_lock_irq(&(idr)->idr_rt) #define idr_unlock_irq(idr) xa_unlock_irq(&(idr)->idr_rt) #define idr_lock_irqsave(idr, flags) \ xa_lock_irqsave(&(idr)->idr_rt, flags) #define idr_unlock_irqrestore(idr, flags) \ xa_unlock_irqrestore(&(idr)->idr_rt, flags) void idr_preload(gfp_t gfp_mask); int idr_alloc(struct idr *, void *ptr, int start, int end, gfp_t); int __must_check idr_alloc_u32(struct idr *, void *ptr, u32 *id, unsigned long max, gfp_t); int idr_alloc_cyclic(struct idr *, void *ptr, int start, int end, gfp_t); void *idr_remove(struct idr *, unsigned long id); void *idr_find(const struct idr *, unsigned long id); int idr_for_each(const struct idr *, int (*fn)(int id, void *p, void *data), void *data); void *idr_get_next(struct idr *, int *nextid); void *idr_get_next_ul(struct idr *, unsigned long *nextid); void *idr_replace(struct idr *, void *, unsigned long id); void idr_destroy(struct idr *); /** * idr_init_base() - Initialise an IDR. * @idr: IDR handle. * @base: The base value for the IDR. * * This variation of idr_init() creates an IDR which will allocate IDs * starting at %base. */ static inline void idr_init_base(struct idr *idr, int base) { INIT_RADIX_TREE(&idr->idr_rt, IDR_RT_MARKER); idr->idr_base = base; idr->idr_next = 0; } /** * idr_init() - Initialise an IDR. * @idr: IDR handle. * * Initialise a dynamically allocated IDR. To initialise a * statically allocated IDR, use DEFINE_IDR(). */ static inline void idr_init(struct idr *idr) { idr_init_base(idr, 0); } /** * idr_is_empty() - Are there any IDs allocated? * @idr: IDR handle. * * Return: %true if any IDs have been allocated from this IDR. */ static inline bool idr_is_empty(const struct idr *idr) { return radix_tree_empty(&idr->idr_rt) && radix_tree_tagged(&idr->idr_rt, IDR_FREE); } /** * idr_preload_end - end preload section started with idr_preload() * * Each idr_preload() should be matched with an invocation of this * function. See idr_preload() for details. */ static inline void idr_preload_end(void) { local_unlock(&radix_tree_preloads.lock); } /** * idr_for_each_entry() - Iterate over an IDR's elements of a given type. * @idr: IDR handle. * @entry: The type * to use as cursor * @id: Entry ID. * * @entry and @id do not need to be initialized before the loop, and * after normal termination @entry is left with the value NULL. This * is convenient for a "not found" value. */ #define idr_for_each_entry(idr, entry, id) \ for (id = 0; ((entry) = idr_get_next(idr, &(id))) != NULL; id += 1U) /** * idr_for_each_entry_ul() - Iterate over an IDR's elements of a given type. * @idr: IDR handle. * @entry: The type * to use as cursor. * @tmp: A temporary placeholder for ID. * @id: Entry ID. * * @entry and @id do not need to be initialized before the loop, and * after normal termination @entry is left with the value NULL. This * is convenient for a "not found" value. */ #define idr_for_each_entry_ul(idr, entry, tmp, id) \ for (tmp = 0, id = 0; \ tmp <= id && ((entry) = idr_get_next_ul(idr, &(id))) != NULL; \ tmp = id, ++id) /** * idr_for_each_entry_continue() - Continue iteration over an IDR's elements of a given type * @idr: IDR handle. * @entry: The type * to use as a cursor. * @id: Entry ID. * * Continue to iterate over entries, continuing after the current position. */ #define idr_for_each_entry_continue(idr, entry, id) \ for ((entry) = idr_get_next((idr), &(id)); \ entry; \ ++id, (entry) = idr_get_next((idr), &(id))) /** * idr_for_each_entry_continue_ul() - Continue iteration over an IDR's elements of a given type * @idr: IDR handle. * @entry: The type * to use as a cursor. * @tmp: A temporary placeholder for ID. * @id: Entry ID. * * Continue to iterate over entries, continuing after the current position. */ #define idr_for_each_entry_continue_ul(idr, entry, tmp, id) \ for (tmp = id; \ tmp <= id && ((entry) = idr_get_next_ul(idr, &(id))) != NULL; \ tmp = id, ++id) /* * IDA - ID Allocator, use when translation from id to pointer isn't necessary. */ #define IDA_CHUNK_SIZE 128 /* 128 bytes per chunk */ #define IDA_BITMAP_LONGS (IDA_CHUNK_SIZE / sizeof(long)) #define IDA_BITMAP_BITS (IDA_BITMAP_LONGS * sizeof(long) * 8) struct ida_bitmap { unsigned long bitmap[IDA_BITMAP_LONGS]; }; struct ida { struct xarray xa; }; #define IDA_INIT_FLAGS (XA_FLAGS_LOCK_IRQ | XA_FLAGS_ALLOC) #define IDA_INIT(name) { \ .xa = XARRAY_INIT(name, IDA_INIT_FLAGS) \ } #define DEFINE_IDA(name) struct ida name = IDA_INIT(name) int ida_alloc_range(struct ida *, unsigned int min, unsigned int max, gfp_t); void ida_free(struct ida *, unsigned int id); void ida_destroy(struct ida *ida); /** * ida_alloc() - Allocate an unused ID. * @ida: IDA handle. * @gfp: Memory allocation flags. * * Allocate an ID between 0 and %INT_MAX, inclusive. * * Context: Any context. It is safe to call this function without * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ static inline int ida_alloc(struct ida *ida, gfp_t gfp) { return ida_alloc_range(ida, 0, ~0, gfp); } /** * ida_alloc_min() - Allocate an unused ID. * @ida: IDA handle. * @min: Lowest ID to allocate. * @gfp: Memory allocation flags. * * Allocate an ID between @min and %INT_MAX, inclusive. * * Context: Any context. It is safe to call this function without * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ static inline int ida_alloc_min(struct ida *ida, unsigned int min, gfp_t gfp) { return ida_alloc_range(ida, min, ~0, gfp); } /** * ida_alloc_max() - Allocate an unused ID. * @ida: IDA handle. * @max: Highest ID to allocate. * @gfp: Memory allocation flags. * * Allocate an ID between 0 and @max, inclusive. * * Context: Any context. It is safe to call this function without * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ static inline int ida_alloc_max(struct ida *ida, unsigned int max, gfp_t gfp) { return ida_alloc_range(ida, 0, max, gfp); } static inline void ida_init(struct ida *ida) { xa_init_flags(&ida->xa, IDA_INIT_FLAGS); } /* * ida_simple_get() and ida_simple_remove() are deprecated. Use * ida_alloc() and ida_free() instead respectively. */ #define ida_simple_get(ida, start, end, gfp) \ ida_alloc_range(ida, start, (end) - 1, gfp) #define ida_simple_remove(ida, id) ida_free(ida, id) static inline bool ida_is_empty(const struct ida *ida) { return xa_empty(&ida->xa); } #endif /* __IDR_H__ */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_COOKIE_H #define __LINUX_COOKIE_H #include <linux/atomic.h> #include <linux/percpu.h> #include <asm/local.h> struct pcpu_gen_cookie { local_t nesting; u64 last; } __aligned(16); struct gen_cookie { struct pcpu_gen_cookie __percpu *local; atomic64_t forward_last ____cacheline_aligned_in_smp; atomic64_t reverse_last; }; #define COOKIE_LOCAL_BATCH 4096 #define DEFINE_COOKIE(name) \ static DEFINE_PER_CPU(struct pcpu_gen_cookie, __##name); \ static struct gen_cookie name = { \ .local = &__##name, \ .forward_last = ATOMIC64_INIT(0), \ .reverse_last = ATOMIC64_INIT(0), \ } static __always_inline u64 gen_cookie_next(struct gen_cookie *gc) { struct pcpu_gen_cookie *local = this_cpu_ptr(gc->local); u64 val; if (likely(local_inc_return(&local->nesting) == 1)) { val = local->last; if (__is_defined(CONFIG_SMP) && unlikely((val & (COOKIE_LOCAL_BATCH - 1)) == 0)) { s64 next = atomic64_add_return(COOKIE_LOCAL_BATCH, &gc->forward_last); val = next - COOKIE_LOCAL_BATCH; } local->last = ++val; } else { val = atomic64_dec_return(&gc->reverse_last); } local_dec(&local->nesting); return val; } #endif /* __LINUX_COOKIE_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMZONE_H #define _LINUX_MMZONE_H #ifndef __ASSEMBLY__ #ifndef __GENERATING_BOUNDS_H #include <linux/spinlock.h> #include <linux/list.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/cache.h> #include <linux/threads.h> #include <linux/numa.h> #include <linux/init.h> #include <linux/seqlock.h> #include <linux/nodemask.h> #include <linux/pageblock-flags.h> #include <linux/page-flags-layout.h> #include <linux/atomic.h> #include <linux/mm_types.h> #include <linux/page-flags.h> #include <asm/page.h> /* Free memory management - zoned buddy allocator. */ #ifndef CONFIG_FORCE_MAX_ZONEORDER #define MAX_ORDER 11 #else #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER #endif #define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1)) /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed * costly to service. That is between allocation orders which should * coalesce naturally under reasonable reclaim pressure and those which * will not. */ #define PAGE_ALLOC_COSTLY_ORDER 3 enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES, #ifdef CONFIG_CMA /* * MIGRATE_CMA migration type is designed to mimic the way * ZONE_MOVABLE works. Only movable pages can be allocated * from MIGRATE_CMA pageblocks and page allocator never * implicitly change migration type of MIGRATE_CMA pageblock. * * The way to use it is to change migratetype of a range of * pageblocks to MIGRATE_CMA which can be done by * __free_pageblock_cma() function. What is important though * is that a range of pageblocks must be aligned to * MAX_ORDER_NR_PAGES should biggest page be bigger then * a single pageblock. */ MIGRATE_CMA, #endif #ifdef CONFIG_MEMORY_ISOLATION MIGRATE_ISOLATE, /* can't allocate from here */ #endif MIGRATE_TYPES }; /* In mm/page_alloc.c; keep in sync also with show_migration_types() there */ extern const char * const migratetype_names[MIGRATE_TYPES]; #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) # define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) #else # define is_migrate_cma(migratetype) false # define is_migrate_cma_page(_page) false #endif static inline bool is_migrate_movable(int mt) { return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE; } #define for_each_migratetype_order(order, type) \ for (order = 0; order < MAX_ORDER; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) extern int page_group_by_mobility_disabled; #define MIGRATETYPE_MASK ((1UL << PB_migratetype_bits) - 1) #define get_pageblock_migratetype(page) \ get_pfnblock_flags_mask(page, page_to_pfn(page), MIGRATETYPE_MASK) struct free_area { struct list_head free_list[MIGRATE_TYPES]; unsigned long nr_free; }; static inline struct page *get_page_from_free_area(struct free_area *area, int migratetype) { return list_first_entry_or_null(&area->free_list[migratetype], struct page, lru); } static inline bool free_area_empty(struct free_area *area, int migratetype) { return list_empty(&area->free_list[migratetype]); } struct pglist_data; /* * zone->lock and the zone lru_lock are two of the hottest locks in the kernel. * So add a wild amount of padding here to ensure that they fall into separate * cachelines. There are very few zone structures in the machine, so space * consumption is not a concern here. */ #if defined(CONFIG_SMP) struct zone_padding { char x[0]; } ____cacheline_internodealigned_in_smp; #define ZONE_PADDING(name) struct zone_padding name; #else #define ZONE_PADDING(name) #endif #ifdef CONFIG_NUMA enum numa_stat_item { NUMA_HIT, /* allocated in intended node */ NUMA_MISS, /* allocated in non intended node */ NUMA_FOREIGN, /* was intended here, hit elsewhere */ NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ NUMA_LOCAL, /* allocation from local node */ NUMA_OTHER, /* allocation from other node */ NR_VM_NUMA_STAT_ITEMS }; #else #define NR_VM_NUMA_STAT_ITEMS 0 #endif enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */ NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE, NR_ZONE_ACTIVE_ANON, NR_ZONE_INACTIVE_FILE, NR_ZONE_ACTIVE_FILE, NR_ZONE_UNEVICTABLE, NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ NR_PAGETABLE, /* used for pagetables */ /* Second 128 byte cacheline */ NR_BOUNCE, #if IS_ENABLED(CONFIG_ZSMALLOC) NR_ZSPAGES, /* allocated in zsmalloc */ #endif NR_FREE_CMA_PAGES, NR_VM_ZONE_STAT_ITEMS }; enum node_stat_item { NR_LRU_BASE, NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ NR_ACTIVE_ANON, /* " " " " " */ NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ NR_UNEVICTABLE, /* " " " " " */ NR_SLAB_RECLAIMABLE_B, NR_SLAB_UNRECLAIMABLE_B, NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ WORKINGSET_NODES, WORKINGSET_REFAULT_BASE, WORKINGSET_REFAULT_ANON = WORKINGSET_REFAULT_BASE, WORKINGSET_REFAULT_FILE, WORKINGSET_ACTIVATE_BASE, WORKINGSET_ACTIVATE_ANON = WORKINGSET_ACTIVATE_BASE, WORKINGSET_ACTIVATE_FILE, WORKINGSET_RESTORE_BASE, WORKINGSET_RESTORE_ANON = WORKINGSET_RESTORE_BASE, WORKINGSET_RESTORE_FILE, WORKINGSET_NODERECLAIM, NR_ANON_MAPPED, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. only modified from process context */ NR_FILE_PAGES, NR_FILE_DIRTY, NR_WRITEBACK, NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */ NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ NR_SHMEM_THPS, NR_SHMEM_PMDMAPPED, NR_FILE_THPS, NR_FILE_PMDMAPPED, NR_ANON_THPS, NR_VMSCAN_WRITE, NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */ NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */ NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */ NR_KERNEL_STACK_KB, /* measured in KiB */ #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) NR_KERNEL_SCS_KB, /* measured in KiB */ #endif NR_VM_NODE_STAT_ITEMS }; /* * Returns true if the value is measured in bytes (most vmstat values are * measured in pages). This defines the API part, the internal representation * might be different. */ static __always_inline bool vmstat_item_in_bytes(int idx) { /* * Global and per-node slab counters track slab pages. * It's expected that changes are multiples of PAGE_SIZE. * Internally values are stored in pages. * * Per-memcg and per-lruvec counters track memory, consumed * by individual slab objects. These counters are actually * byte-precise. */ return (idx == NR_SLAB_RECLAIMABLE_B || idx == NR_SLAB_UNRECLAIMABLE_B); } /* * We do arithmetic on the LRU lists in various places in the code, * so it is important to keep the active lists LRU_ACTIVE higher in * the array than the corresponding inactive lists, and to keep * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists. * * This has to be kept in sync with the statistics in zone_stat_item * above and the descriptions in vmstat_text in mm/vmstat.c */ #define LRU_BASE 0 #define LRU_ACTIVE 1 #define LRU_FILE 2 enum lru_list { LRU_INACTIVE_ANON = LRU_BASE, LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, LRU_UNEVICTABLE, NR_LRU_LISTS }; #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++) #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) static inline bool is_file_lru(enum lru_list lru) { return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE); } static inline bool is_active_lru(enum lru_list lru) { return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } #define ANON_AND_FILE 2 enum lruvec_flags { LRUVEC_CONGESTED, /* lruvec has many dirty pages * backed by a congested BDI */ }; struct lruvec { struct list_head lists[NR_LRU_LISTS]; /* * These track the cost of reclaiming one LRU - file or anon - * over the other. As the observed cost of reclaiming one LRU * increases, the reclaim scan balance tips toward the other. */ unsigned long anon_cost; unsigned long file_cost; /* Non-resident age, driven by LRU movement */ atomic_long_t nonresident_age; /* Refaults at the time of last reclaim cycle */ unsigned long refaults[ANON_AND_FILE]; /* Various lruvec state flags (enum lruvec_flags) */ unsigned long flags; #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif }; /* Isolate unmapped pages */ #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2) /* Isolate for asynchronous migration */ #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4) /* Isolate unevictable pages */ #define ISOLATE_UNEVICTABLE ((__force isolate_mode_t)0x8) /* LRU Isolation modes. */ typedef unsigned __bitwise isolate_mode_t; enum zone_watermarks { WMARK_MIN, WMARK_LOW, WMARK_HIGH, NR_WMARK }; #define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost) #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost) #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost) struct per_cpu_pages { int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ int batch; /* chunk size for buddy add/remove */ /* Lists of pages, one per migrate type stored on the pcp-lists */ struct list_head lists[MIGRATE_PCPTYPES]; }; struct per_cpu_pageset { struct per_cpu_pages pcp; #ifdef CONFIG_NUMA s8 expire; u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; #endif #ifdef CONFIG_SMP s8 stat_threshold; s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; #endif }; struct per_cpu_nodestat { s8 stat_threshold; s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS]; }; #endif /* !__GENERATING_BOUNDS.H */ enum zone_type { /* * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able * to DMA to all of the addressable memory (ZONE_NORMAL). * On architectures where this area covers the whole 32 bit address * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller * DMA addressing constraints. This distinction is important as a 32bit * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit * platforms may need both zones as they support peripherals with * different DMA addressing limitations. */ #ifdef CONFIG_ZONE_DMA ZONE_DMA, #endif #ifdef CONFIG_ZONE_DMA32 ZONE_DMA32, #endif /* * Normal addressable memory is in ZONE_NORMAL. DMA operations can be * performed on pages in ZONE_NORMAL if the DMA devices support * transfers to all addressable memory. */ ZONE_NORMAL, #ifdef CONFIG_HIGHMEM /* * A memory area that is only addressable by the kernel through * mapping portions into its own address space. This is for example * used by i386 to allow the kernel to address the memory beyond * 900MB. The kernel will set up special mappings (page * table entries on i386) for each page that the kernel needs to * access. */ ZONE_HIGHMEM, #endif /* * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains * movable pages with few exceptional cases described below. Main use * cases for ZONE_MOVABLE are to make memory offlining/unplug more * likely to succeed, and to locally limit unmovable allocations - e.g., * to increase the number of THP/huge pages. Notable special cases are: * * 1. Pinned pages: (long-term) pinning of movable pages might * essentially turn such pages unmovable. Memory offlining might * retry a long time. * 2. memblock allocations: kernelcore/movablecore setups might create * situations where ZONE_MOVABLE contains unmovable allocations * after boot. Memory offlining and allocations fail early. * 3. Memory holes: kernelcore/movablecore setups might create very rare * situations where ZONE_MOVABLE contains memory holes after boot, * for example, if we have sections that are only partially * populated. Memory offlining and allocations fail early. * 4. PG_hwpoison pages: while poisoned pages can be skipped during * memory offlining, such pages cannot be allocated. * 5. Unmovable PG_offline pages: in paravirtualized environments, * hotplugged memory blocks might only partially be managed by the * buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The * parts not manged by the buddy are unmovable PG_offline pages. In * some cases (virtio-mem), such pages can be skipped during * memory offlining, however, cannot be moved/allocated. These * techniques might use alloc_contig_range() to hide previously * exposed pages from the buddy again (e.g., to implement some sort * of memory unplug in virtio-mem). * * In general, no unmovable allocations that degrade memory offlining * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) * have to expect that migrating pages in ZONE_MOVABLE can fail (even * if has_unmovable_pages() states that there are no unmovable pages, * there can be false negatives). */ ZONE_MOVABLE, #ifdef CONFIG_ZONE_DEVICE ZONE_DEVICE, #endif __MAX_NR_ZONES }; #ifndef __GENERATING_BOUNDS_H #define ASYNC_AND_SYNC 2 struct zone { /* Read-mostly fields */ /* zone watermarks, access with *_wmark_pages(zone) macros */ unsigned long _watermark[NR_WMARK]; unsigned long watermark_boost; unsigned long nr_reserved_highatomic; /* * We don't know if the memory that we're going to allocate will be * freeable or/and it will be released eventually, so to avoid totally * wasting several GB of ram we must reserve some of the lower zone * memory (otherwise we risk to run OOM on the lower zones despite * there being tons of freeable ram on the higher zones). This array is * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl * changes. */ long lowmem_reserve[MAX_NR_ZONES]; #ifdef CONFIG_NEED_MULTIPLE_NODES int node; #endif struct pglist_data *zone_pgdat; struct per_cpu_pageset __percpu *pageset; #ifndef CONFIG_SPARSEMEM /* * Flags for a pageblock_nr_pages block. See pageblock-flags.h. * In SPARSEMEM, this map is stored in struct mem_section */ unsigned long *pageblock_flags; #endif /* CONFIG_SPARSEMEM */ /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; /* * spanned_pages is the total pages spanned by the zone, including * holes, which is calculated as: * spanned_pages = zone_end_pfn - zone_start_pfn; * * present_pages is physical pages existing within the zone, which * is calculated as: * present_pages = spanned_pages - absent_pages(pages in holes); * * managed_pages is present pages managed by the buddy system, which * is calculated as (reserved_pages includes pages allocated by the * bootmem allocator): * managed_pages = present_pages - reserved_pages; * * So present_pages may be used by memory hotplug or memory power * management logic to figure out unmanaged pages by checking * (present_pages - managed_pages). And managed_pages should be used * by page allocator and vm scanner to calculate all kinds of watermarks * and thresholds. * * Locking rules: * * zone_start_pfn and spanned_pages are protected by span_seqlock. * It is a seqlock because it has to be read outside of zone->lock, * and it is done in the main allocator path. But, it is written * quite infrequently. * * The span_seq lock is declared along with zone->lock because it is * frequently read in proximity to zone->lock. It's good to * give them a chance of being in the same cacheline. * * Write access to present_pages at runtime should be protected by * mem_hotplug_begin/end(). Any reader who can't tolerant drift of * present_pages should get_online_mems() to get a stable value. */ atomic_long_t managed_pages; unsigned long spanned_pages; unsigned long present_pages; const char *name; #ifdef CONFIG_MEMORY_ISOLATION /* * Number of isolated pageblock. It is used to solve incorrect * freepage counting problem due to racy retrieving migratetype * of pageblock. Protected by zone->lock. */ unsigned long nr_isolate_pageblock; #endif #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ seqlock_t span_seqlock; #endif int initialized; /* Write-intensive fields used from the page allocator */ ZONE_PADDING(_pad1_) /* free areas of different sizes */ struct free_area free_area[MAX_ORDER]; /* zone flags, see below */ unsigned long flags; /* Primarily protects free_area */ spinlock_t lock; /* Write-intensive fields used by compaction and vmstats. */ ZONE_PADDING(_pad2_) /* * When free pages are below this point, additional steps are taken * when reading the number of free pages to avoid per-cpu counter * drift allowing watermarks to be breached */ unsigned long percpu_drift_mark; #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* pfn where compaction free scanner should start */ unsigned long compact_cached_free_pfn; /* pfn where compaction migration scanner should start */ unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC]; unsigned long compact_init_migrate_pfn; unsigned long compact_init_free_pfn; #endif #ifdef CONFIG_COMPACTION /* * On compaction failure, 1<<compact_defer_shift compactions * are skipped before trying again. The number attempted since * last failure is tracked with compact_considered. * compact_order_failed is the minimum compaction failed order. */ unsigned int compact_considered; unsigned int compact_defer_shift; int compact_order_failed; #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* Set to true when the PG_migrate_skip bits should be cleared */ bool compact_blockskip_flush; #endif bool contiguous; ZONE_PADDING(_pad3_) /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS]; } ____cacheline_internodealigned_in_smp; enum pgdat_flags { PGDAT_DIRTY, /* reclaim scanning has recently found * many dirty file pages at the tail * of the LRU. */ PGDAT_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */ }; enum zone_flags { ZONE_BOOSTED_WATERMARK, /* zone recently boosted watermarks. * Cleared when kswapd is woken. */ }; static inline unsigned long zone_managed_pages(struct zone *zone) { return (unsigned long)atomic_long_read(&zone->managed_pages); } static inline unsigned long zone_end_pfn(const struct zone *zone) { return zone->zone_start_pfn + zone->spanned_pages; } static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) { return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone); } static inline bool zone_is_initialized(struct zone *zone) { return zone->initialized; } static inline bool zone_is_empty(struct zone *zone) { return zone->spanned_pages == 0; } /* * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty * intersection with the given zone */ static inline bool zone_intersects(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { if (zone_is_empty(zone)) return false; if (start_pfn >= zone_end_pfn(zone) || start_pfn + nr_pages <= zone->zone_start_pfn) return false; return true; } /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the * queues ("queue_length >> 12") during an aging round. */ #define DEF_PRIORITY 12 /* Maximum number of zones on a zonelist */ #define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES) enum { ZONELIST_FALLBACK, /* zonelist with fallback */ #ifdef CONFIG_NUMA /* * The NUMA zonelists are doubled because we need zonelists that * restrict the allocations to a single node for __GFP_THISNODE. */ ZONELIST_NOFALLBACK, /* zonelist without fallback (__GFP_THISNODE) */ #endif MAX_ZONELISTS }; /* * This struct contains information about a zone in a zonelist. It is stored * here to avoid dereferences into large structures and lookups of tables */ struct zoneref { struct zone *zone; /* Pointer to actual zone */ int zone_idx; /* zone_idx(zoneref->zone) */ }; /* * One allocation request operates on a zonelist. A zonelist * is a list of zones, the first one is the 'goal' of the * allocation, the other zones are fallback zones, in decreasing * priority. * * To speed the reading of the zonelist, the zonerefs contain the zone index * of the entry being read. Helper functions to access information given * a struct zoneref are * * zonelist_zone() - Return the struct zone * for an entry in _zonerefs * zonelist_zone_idx() - Return the index of the zone for an entry * zonelist_node_idx() - Return the index of the node for an entry */ struct zonelist { struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1]; }; #ifndef CONFIG_DISCONTIGMEM /* The array of struct pages - for discontigmem use pgdat->lmem_map */ extern struct page *mem_map; #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split { spinlock_t split_queue_lock; struct list_head split_queue; unsigned long split_queue_len; }; #endif /* * On NUMA machines, each NUMA node would have a pg_data_t to describe * it's memory layout. On UMA machines there is a single pglist_data which * describes the whole memory. * * Memory statistics and page replacement data structures are maintained on a * per-zone basis. */ typedef struct pglist_data { /* * node_zones contains just the zones for THIS node. Not all of the * zones may be populated, but it is the full list. It is referenced by * this node's node_zonelists as well as other node's node_zonelists. */ struct zone node_zones[MAX_NR_ZONES]; /* * node_zonelists contains references to all zones in all nodes. * Generally the first zones will be references to this node's * node_zones. */ struct zonelist node_zonelists[MAX_ZONELISTS]; int nr_zones; /* number of populated zones in this node */ #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ struct page *node_mem_map; #ifdef CONFIG_PAGE_EXTENSION struct page_ext *node_page_ext; #endif #endif #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT) /* * Must be held any time you expect node_start_pfn, * node_present_pages, node_spanned_pages or nr_zones to stay constant. * Also synchronizes pgdat->first_deferred_pfn during deferred page * init. * * pgdat_resize_lock() and pgdat_resize_unlock() are provided to * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG * or CONFIG_DEFERRED_STRUCT_PAGE_INIT. * * Nests above zone->lock and zone->span_seqlock */ spinlock_t node_size_lock; #endif unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; wait_queue_head_t kswapd_wait; wait_queue_head_t pfmemalloc_wait; struct task_struct *kswapd; /* Protected by mem_hotplug_begin/end() */ int kswapd_order; enum zone_type kswapd_highest_zoneidx; int kswapd_failures; /* Number of 'reclaimed == 0' runs */ #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_highest_zoneidx; wait_queue_head_t kcompactd_wait; struct task_struct *kcompactd; #endif /* * This is a per-node reserve of pages that are not available * to userspace allocations. */ unsigned long totalreserve_pages; #ifdef CONFIG_NUMA /* * node reclaim becomes active if more unmapped pages exist. */ unsigned long min_unmapped_pages; unsigned long min_slab_pages; #endif /* CONFIG_NUMA */ /* Write-intensive fields used by page reclaim */ ZONE_PADDING(_pad1_) spinlock_t lru_lock; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * If memory initialisation on large machines is deferred then this * is the first PFN that needs to be initialised. */ unsigned long first_deferred_pfn; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split deferred_split_queue; #endif /* Fields commonly accessed by the page reclaim scanner */ /* * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED. * * Use mem_cgroup_lruvec() to look up lruvecs. */ struct lruvec __lruvec; unsigned long flags; ZONE_PADDING(_pad2_) /* Per-node vmstats */ struct per_cpu_nodestat __percpu *per_cpu_nodestats; atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]; } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) #ifdef CONFIG_FLAT_NODE_MEM_MAP #define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) #else #define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr)) #endif #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) { return pgdat->node_start_pfn + pgdat->node_spanned_pages; } static inline bool pgdat_is_empty(pg_data_t *pgdat) { return !pgdat->node_start_pfn && !pgdat->node_spanned_pages; } #include <linux/memory_hotplug.h> void build_all_zonelists(pg_data_t *pgdat); void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order, enum zone_type highest_zoneidx); bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags, long free_pages); bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx); /* * Memory initialization context, use to differentiate memory added by * the platform statically or via memory hotplug interface. */ enum meminit_context { MEMINIT_EARLY, MEMINIT_HOTPLUG, }; extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, unsigned long size); extern void lruvec_init(struct lruvec *lruvec); static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) { #ifdef CONFIG_MEMCG return lruvec->pgdat; #else return container_of(lruvec, struct pglist_data, __lruvec); #endif } extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx); #ifdef CONFIG_HAVE_MEMORYLESS_NODES int local_memory_node(int node_id); #else static inline int local_memory_node(int node_id) { return node_id; }; #endif /* * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. */ #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) /* * Returns true if a zone has pages managed by the buddy allocator. * All the reclaim decisions have to use this function rather than * populated_zone(). If the whole zone is reserved then we can easily * end up with populated_zone() && !managed_zone(). */ static inline bool managed_zone(struct zone *zone) { return zone_managed_pages(zone); } /* Returns true if a zone has memory */ static inline bool populated_zone(struct zone *zone) { return zone->present_pages; } #ifdef CONFIG_NEED_MULTIPLE_NODES static inline int zone_to_nid(struct zone *zone) { return zone->node; } static inline void zone_set_nid(struct zone *zone, int nid) { zone->node = nid; } #else static inline int zone_to_nid(struct zone *zone) { return 0; } static inline void zone_set_nid(struct zone *zone, int nid) {} #endif extern int movable_zone; #ifdef CONFIG_HIGHMEM static inline int zone_movable_is_highmem(void) { #ifdef CONFIG_NEED_MULTIPLE_NODES return movable_zone == ZONE_HIGHMEM; #else return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM; #endif } #endif static inline int is_highmem_idx(enum zone_type idx) { #ifdef CONFIG_HIGHMEM return (idx == ZONE_HIGHMEM || (idx == ZONE_MOVABLE && zone_movable_is_highmem())); #else return 0; #endif } /** * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. * @zone - pointer to struct zone variable */ static inline int is_highmem(struct zone *zone) { #ifdef CONFIG_HIGHMEM return is_highmem_idx(zone_idx(zone)); #else return 0; #endif } /* These two functions are used to setup the per zone pages min values */ struct ctl_table; int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int numa_zonelist_order_handler(struct ctl_table *, int, void *, size_t *, loff_t *); extern int percpu_pagelist_fraction; extern char numa_zonelist_order[]; #define NUMA_ZONELIST_ORDER_LEN 16 #ifndef CONFIG_NEED_MULTIPLE_NODES extern struct pglist_data contig_page_data; #define NODE_DATA(nid) (&contig_page_data) #define NODE_MEM_MAP(nid) mem_map #else /* CONFIG_NEED_MULTIPLE_NODES */ #include <asm/mmzone.h> #endif /* !CONFIG_NEED_MULTIPLE_NODES */ extern struct pglist_data *first_online_pgdat(void); extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); extern struct zone *next_zone(struct zone *zone); /** * for_each_online_pgdat - helper macro to iterate over all online nodes * @pgdat - pointer to a pg_data_t variable */ #define for_each_online_pgdat(pgdat) \ for (pgdat = first_online_pgdat(); \ pgdat; \ pgdat = next_online_pgdat(pgdat)) /** * for_each_zone - helper macro to iterate over all memory zones * @zone - pointer to struct zone variable * * The user only needs to declare the zone variable, for_each_zone * fills it in. */ #define for_each_zone(zone) \ for (zone = (first_online_pgdat())->node_zones; \ zone; \ zone = next_zone(zone)) #define for_each_populated_zone(zone) \ for (zone = (first_online_pgdat())->node_zones; \ zone; \ zone = next_zone(zone)) \ if (!populated_zone(zone)) \ ; /* do nothing */ \ else static inline struct zone *zonelist_zone(struct zoneref *zoneref) { return zoneref->zone; } static inline int zonelist_zone_idx(struct zoneref *zoneref) { return zoneref->zone_idx; } static inline int zonelist_node_idx(struct zoneref *zoneref) { return zone_to_nid(zoneref->zone); } struct zoneref *__next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes); /** * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point * @z - The cursor used as a starting point for the search * @highest_zoneidx - The zone index of the highest zone to return * @nodes - An optional nodemask to filter the zonelist with * * This function returns the next zone at or below a given zone index that is * within the allowed nodemask using a cursor as the starting point for the * search. The zoneref returned is a cursor that represents the current zone * being examined. It should be advanced by one before calling * next_zones_zonelist again. */ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes) { if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx)) return z; return __next_zones_zonelist(z, highest_zoneidx, nodes); } /** * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist * @zonelist - The zonelist to search for a suitable zone * @highest_zoneidx - The zone index of the highest zone to return * @nodes - An optional nodemask to filter the zonelist with * @return - Zoneref pointer for the first suitable zone found (see below) * * This function returns the first zone at or below a given zone index that is * within the allowed nodemask. The zoneref returned is a cursor that can be * used to iterate the zonelist with next_zones_zonelist by advancing it by * one before calling. * * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is * never NULL). This may happen either genuinely, or due to concurrent nodemask * update due to cpuset modification. */ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, enum zone_type highest_zoneidx, nodemask_t *nodes) { return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes); } /** * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask * @zone - The current zone in the iterator * @z - The current pointer within zonelist->_zonerefs being iterated * @zlist - The zonelist being iterated * @highidx - The zone index of the highest zone to return * @nodemask - Nodemask allowed by the allocator * * This iterator iterates though all zones at or below a given zone index and * within a given nodemask */ #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z); \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ zone = zonelist_zone(z)) #define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \ for (zone = z->zone; \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ zone = zonelist_zone(z)) /** * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index * @zone - The current zone in the iterator * @z - The current pointer within zonelist->zones being iterated * @zlist - The zonelist being iterated * @highidx - The zone index of the highest zone to return * * This iterator iterates though all zones at or below a given zone index. */ #define for_each_zone_zonelist(zone, z, zlist, highidx) \ for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL) #ifdef CONFIG_SPARSEMEM #include <asm/sparsemem.h> #endif #ifdef CONFIG_FLATMEM #define pfn_to_nid(pfn) (0) #endif #ifdef CONFIG_SPARSEMEM /* * SECTION_SHIFT #bits space required to store a section # * * PA_SECTION_SHIFT physical address to/from section number * PFN_SECTION_SHIFT pfn to/from section number */ #define PA_SECTION_SHIFT (SECTION_SIZE_BITS) #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) #define SECTION_BLOCKFLAGS_BITS \ ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS) #if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS #error Allocator MAX_ORDER exceeds SECTION_SIZE #endif static inline unsigned long pfn_to_section_nr(unsigned long pfn) { return pfn >> PFN_SECTION_SHIFT; } static inline unsigned long section_nr_to_pfn(unsigned long sec) { return sec << PFN_SECTION_SHIFT; } #define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK) #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) #define SUBSECTION_SHIFT 21 #define SUBSECTION_SIZE (1UL << SUBSECTION_SHIFT) #define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT) #define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT) #define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1)) #if SUBSECTION_SHIFT > SECTION_SIZE_BITS #error Subsection size exceeds section size #else #define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT)) #endif #define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION) #define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK) struct mem_section_usage { #ifdef CONFIG_SPARSEMEM_VMEMMAP DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION); #endif /* See declaration of similar field in struct zone */ unsigned long pageblock_flags[0]; }; void subsection_map_init(unsigned long pfn, unsigned long nr_pages); struct page; struct page_ext; struct mem_section { /* * This is, logically, a pointer to an array of struct * pages. However, it is stored with some other magic. * (see sparse.c::sparse_init_one_section()) * * Additionally during early boot we encode node id of * the location of the section here to guide allocation. * (see sparse.c::memory_present()) * * Making it a UL at least makes someone do a cast * before using it wrong. */ unsigned long section_mem_map; struct mem_section_usage *usage; #ifdef CONFIG_PAGE_EXTENSION /* * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use * section. (see page_ext.h about this.) */ struct page_ext *page_ext; unsigned long pad; #endif /* * WARNING: mem_section must be a power-of-2 in size for the * calculation and use of SECTION_ROOT_MASK to make sense. */ }; #ifdef CONFIG_SPARSEMEM_EXTREME #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) #else #define SECTIONS_PER_ROOT 1 #endif #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) #define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT) #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) #ifdef CONFIG_SPARSEMEM_EXTREME extern struct mem_section **mem_section; #else extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; #endif static inline unsigned long *section_to_usemap(struct mem_section *ms) { return ms->usage->pageblock_flags; } static inline struct mem_section *__nr_to_section(unsigned long nr) { #ifdef CONFIG_SPARSEMEM_EXTREME if (!mem_section) return NULL; #endif if (!mem_section[SECTION_NR_TO_ROOT(nr)]) return NULL; return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; } extern unsigned long __section_nr(struct mem_section *ms); extern size_t mem_section_usage_size(void); /* * We use the lower bits of the mem_map pointer to store * a little bit of information. The pointer is calculated * as mem_map - section_nr_to_pfn(pnum). The result is * aligned to the minimum alignment of the two values: * 1. All mem_map arrays are page-aligned. * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT * lowest bits. PFN_SECTION_SHIFT is arch-specific * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the * worst combination is powerpc with 256k pages, * which results in PFN_SECTION_SHIFT equal 6. * To sum it up, at least 6 bits are available. */ #define SECTION_MARKED_PRESENT (1UL<<0) #define SECTION_HAS_MEM_MAP (1UL<<1) #define SECTION_IS_ONLINE (1UL<<2) #define SECTION_IS_EARLY (1UL<<3) #define SECTION_MAP_LAST_BIT (1UL<<4) #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) #define SECTION_NID_SHIFT 3 static inline struct page *__section_mem_map_addr(struct mem_section *section) { unsigned long map = section->section_mem_map; map &= SECTION_MAP_MASK; return (struct page *)map; } static inline int present_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); } static inline int present_section_nr(unsigned long nr) { return present_section(__nr_to_section(nr)); } static inline int valid_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); } static inline int early_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_EARLY)); } static inline int valid_section_nr(unsigned long nr) { return valid_section(__nr_to_section(nr)); } static inline int online_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_ONLINE)); } static inline int online_section_nr(unsigned long nr) { return online_section(__nr_to_section(nr)); } #ifdef CONFIG_MEMORY_HOTPLUG void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn); #ifdef CONFIG_MEMORY_HOTREMOVE void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn); #endif #endif static inline struct mem_section *__pfn_to_section(unsigned long pfn) { return __nr_to_section(pfn_to_section_nr(pfn)); } extern unsigned long __highest_present_section_nr; static inline int subsection_map_index(unsigned long pfn) { return (pfn & ~(PAGE_SECTION_MASK)) / PAGES_PER_SUBSECTION; } #ifdef CONFIG_SPARSEMEM_VMEMMAP static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { int idx = subsection_map_index(pfn); return test_bit(idx, ms->usage->subsection_map); } #else static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { return 1; } #endif #ifndef CONFIG_HAVE_ARCH_PFN_VALID static inline int pfn_valid(unsigned long pfn) { struct mem_section *ms; if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; ms = __nr_to_section(pfn_to_section_nr(pfn)); if (!valid_section(ms)) return 0; /* * Traditionally early sections always returned pfn_valid() for * the entire section-sized span. */ return early_section(ms) || pfn_section_valid(ms, pfn); } #endif static inline int pfn_in_present_section(unsigned long pfn) { if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; return present_section(__nr_to_section(pfn_to_section_nr(pfn))); } static inline unsigned long next_present_section_nr(unsigned long section_nr) { while (++section_nr <= __highest_present_section_nr) { if (present_section_nr(section_nr)) return section_nr; } return -1; } /* * These are _only_ used during initialisation, therefore they * can use __initdata ... They could have names to indicate * this restriction. */ #ifdef CONFIG_NUMA #define pfn_to_nid(pfn) \ ({ \ unsigned long __pfn_to_nid_pfn = (pfn); \ page_to_nid(pfn_to_page(__pfn_to_nid_pfn)); \ }) #else #define pfn_to_nid(pfn) (0) #endif void sparse_init(void); #else #define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) #define pfn_in_present_section pfn_valid #define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ /* * During memory init memblocks map pfns to nids. The search is expensive and * this caches recent lookups. The implementation of __early_pfn_to_nid * may treat start/end as pfns or sections. */ struct mminit_pfnnid_cache { unsigned long last_start; unsigned long last_end; int last_nid; }; /* * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we * need to check pfn validity within that MAX_ORDER_NR_PAGES block. * pfn_valid_within() should be used in this case; we optimise this away * when we have no holes within a MAX_ORDER_NR_PAGES block. */ #ifdef CONFIG_HOLES_IN_ZONE #define pfn_valid_within(pfn) pfn_valid(pfn) #else #define pfn_valid_within(pfn) (1) #endif #ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL /* * pfn_valid() is meant to be able to tell if a given PFN has valid memmap * associated with it or not. This means that a struct page exists for this * pfn. The caller cannot assume the page is fully initialized in general. * Hotplugable pages might not have been onlined yet. pfn_to_online_page() * will ensure the struct page is fully online and initialized. Special pages * (e.g. ZONE_DEVICE) are never onlined and should be treated accordingly. * * In FLATMEM, it is expected that holes always have valid memmap as long as * there is valid PFNs either side of the hole. In SPARSEMEM, it is assumed * that a valid section has a memmap for the entire section. * * However, an ARM, and maybe other embedded architectures in the future * free memmap backing holes to save memory on the assumption the memmap is * never used. The page_zone linkages are then broken even though pfn_valid() * returns true. A walker of the full memmap must then do this additional * check to ensure the memmap they are looking at is sane by making sure * the zone and PFN linkages are still valid. This is expensive, but walkers * of the full memmap are extremely rare. */ bool memmap_valid_within(unsigned long pfn, struct page *page, struct zone *zone); #else static inline bool memmap_valid_within(unsigned long pfn, struct page *page, struct zone *zone) { return true; } #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ #endif /* !__GENERATING_BOUNDS.H */ #endif /* !__ASSEMBLY__ */ #endif /* _LINUX_MMZONE_H */
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 // SPDX-License-Identifier: GPL-2.0-only /* * fs/dcache.c * * Complete reimplementation * (C) 1997 Thomas Schoebel-Theuer, * with heavy changes by Linus Torvalds */ /* * Notes on the allocation strategy: * * The dcache is a master of the icache - whenever a dcache entry * exists, the inode will always exist. "iput()" is done either when * the dcache entry is deleted or garbage collected. */ #include <linux/ratelimit.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/fscrypt.h> #include <linux/fsnotify.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/cache.h> #include <linux/export.h> #include <linux/security.h> #include <linux/seqlock.h> #include <linux/memblock.h> #include <linux/bit_spinlock.h> #include <linux/rculist_bl.h> #include <linux/list_lru.h> #include "internal.h" #include "mount.h" /* * Usage: * dcache->d_inode->i_lock protects: * - i_dentry, d_u.d_alias, d_inode of aliases * dcache_hash_bucket lock protects: * - the dcache hash table * s_roots bl list spinlock protects: * - the s_roots list (see __d_drop) * dentry->d_sb->s_dentry_lru_lock protects: * - the dcache lru lists and counters * d_lock protects: * - d_flags * - d_name * - d_lru * - d_count * - d_unhashed() * - d_parent and d_subdirs * - childrens' d_child and d_parent * - d_u.d_alias, d_inode * * Ordering: * dentry->d_inode->i_lock * dentry->d_lock * dentry->d_sb->s_dentry_lru_lock * dcache_hash_bucket lock * s_roots lock * * If there is an ancestor relationship: * dentry->d_parent->...->d_parent->d_lock * ... * dentry->d_parent->d_lock * dentry->d_lock * * If no ancestor relationship: * arbitrary, since it's serialized on rename_lock */ int sysctl_vfs_cache_pressure __read_mostly = 100; EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(rename_lock); static struct kmem_cache *dentry_cache __read_mostly; const struct qstr empty_name = QSTR_INIT("", 0); EXPORT_SYMBOL(empty_name); const struct qstr slash_name = QSTR_INIT("/", 1); EXPORT_SYMBOL(slash_name); /* * This is the single most critical data structure when it comes * to the dcache: the hashtable for lookups. Somebody should try * to make this good - I've just made it work. * * This hash-function tries to avoid losing too many bits of hash * information, yet avoid using a prime hash-size or similar. */ static unsigned int d_hash_shift __read_mostly; static struct hlist_bl_head *dentry_hashtable __read_mostly; static inline struct hlist_bl_head *d_hash(unsigned int hash) { return dentry_hashtable + (hash >> d_hash_shift); } #define IN_LOOKUP_SHIFT 10 static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT]; static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent, unsigned int hash) { hash += (unsigned long) parent / L1_CACHE_BYTES; return in_lookup_hashtable + hash_32(hash, IN_LOOKUP_SHIFT); } /* Statistics gathering. */ struct dentry_stat_t dentry_stat = { .age_limit = 45, }; static DEFINE_PER_CPU(long, nr_dentry); static DEFINE_PER_CPU(long, nr_dentry_unused); static DEFINE_PER_CPU(long, nr_dentry_negative); #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) /* * Here we resort to our own counters instead of using generic per-cpu counters * for consistency with what the vfs inode code does. We are expected to harvest * better code and performance by having our own specialized counters. * * Please note that the loop is done over all possible CPUs, not over all online * CPUs. The reason for this is that we don't want to play games with CPUs going * on and off. If one of them goes off, we will just keep their counters. * * glommer: See cffbc8a for details, and if you ever intend to change this, * please update all vfs counters to match. */ static long get_nr_dentry(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry, i); return sum < 0 ? 0 : sum; } static long get_nr_dentry_unused(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry_unused, i); return sum < 0 ? 0 : sum; } static long get_nr_dentry_negative(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry_negative, i); return sum < 0 ? 0 : sum; } int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); dentry_stat.nr_unused = get_nr_dentry_unused(); dentry_stat.nr_negative = get_nr_dentry_negative(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif /* * Compare 2 name strings, return 0 if they match, otherwise non-zero. * The strings are both count bytes long, and count is non-zero. */ #ifdef CONFIG_DCACHE_WORD_ACCESS #include <asm/word-at-a-time.h> /* * NOTE! 'cs' and 'scount' come from a dentry, so it has a * aligned allocation for this particular component. We don't * strictly need the load_unaligned_zeropad() safety, but it * doesn't hurt either. * * In contrast, 'ct' and 'tcount' can be from a pathname, and do * need the careful unaligned handling. */ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) { unsigned long a,b,mask; for (;;) { a = read_word_at_a_time(cs); b = load_unaligned_zeropad(ct); if (tcount < sizeof(unsigned long)) break; if (unlikely(a != b)) return 1; cs += sizeof(unsigned long); ct += sizeof(unsigned long); tcount -= sizeof(unsigned long); if (!tcount) return 0; } mask = bytemask_from_count(tcount); return unlikely(!!((a ^ b) & mask)); } #else static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) { do { if (*cs != *ct) return 1; cs++; ct++; tcount--; } while (tcount); return 0; } #endif static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount) { /* * Be careful about RCU walk racing with rename: * use 'READ_ONCE' to fetch the name pointer. * * NOTE! Even if a rename will mean that the length * was not loaded atomically, we don't care. The * RCU walk will check the sequence count eventually, * and catch it. And we won't overrun the buffer, * because we're reading the name pointer atomically, * and a dentry name is guaranteed to be properly * terminated with a NUL byte. * * End result: even if 'len' is wrong, we'll exit * early because the data cannot match (there can * be no NUL in the ct/tcount data) */ const unsigned char *cs = READ_ONCE(dentry->d_name.name); return dentry_string_cmp(cs, ct, tcount); } struct external_name { union { atomic_t count; struct rcu_head head; } u; unsigned char name[]; }; static inline struct external_name *external_name(struct dentry *dentry) { return container_of(dentry->d_name.name, struct external_name, name[0]); } static void __d_free(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); kmem_cache_free(dentry_cache, dentry); } static void __d_free_external(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); } static inline int dname_external(const struct dentry *dentry) { return dentry->d_name.name != dentry->d_iname; } void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry) { spin_lock(&dentry->d_lock); name->name = dentry->d_name; if (unlikely(dname_external(dentry))) { atomic_inc(&external_name(dentry)->u.count); } else { memcpy(name->inline_name, dentry->d_iname, dentry->d_name.len + 1); name->name.name = name->inline_name; } spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(take_dentry_name_snapshot); void release_dentry_name_snapshot(struct name_snapshot *name) { if (unlikely(name->name.name != name->inline_name)) { struct external_name *p; p = container_of(name->name.name, struct external_name, name[0]); if (unlikely(atomic_dec_and_test(&p->u.count))) kfree_rcu(p, u.head); } } EXPORT_SYMBOL(release_dentry_name_snapshot); static inline void __d_set_inode_and_type(struct dentry *dentry, struct inode *inode, unsigned type_flags) { unsigned flags; dentry->d_inode = inode; flags = READ_ONCE(dentry->d_flags); flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); flags |= type_flags; smp_store_release(&dentry->d_flags, flags); } static inline void __d_clear_type_and_inode(struct dentry *dentry) { unsigned flags = READ_ONCE(dentry->d_flags); flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); WRITE_ONCE(dentry->d_flags, flags); dentry->d_inode = NULL; if (dentry->d_flags & DCACHE_LRU_LIST) this_cpu_inc(nr_dentry_negative); } static void dentry_free(struct dentry *dentry) { WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); if (unlikely(dname_external(dentry))) { struct external_name *p = external_name(dentry); if (likely(atomic_dec_and_test(&p->u.count))) { call_rcu(&dentry->d_u.d_rcu, __d_free_external); return; } } /* if dentry was never visible to RCU, immediate free is OK */ if (dentry->d_flags & DCACHE_NORCU) __d_free(&dentry->d_u.d_rcu); else call_rcu(&dentry->d_u.d_rcu, __d_free); } /* * Release the dentry's inode, using the filesystem * d_iput() operation if defined. */ static void dentry_unlink_inode(struct dentry * dentry) __releases(dentry->d_lock) __releases(dentry->d_inode->i_lock) { struct inode *inode = dentry->d_inode; raw_write_seqcount_begin(&dentry->d_seq); __d_clear_type_and_inode(dentry); hlist_del_init(&dentry->d_u.d_alias); raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) fsnotify_inoderemove(inode); if (dentry->d_op && dentry->d_op->d_iput) dentry->d_op->d_iput(dentry, inode); else iput(inode); } /* * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry * is in use - which includes both the "real" per-superblock * LRU list _and_ the DCACHE_SHRINK_LIST use. * * The DCACHE_SHRINK_LIST bit is set whenever the dentry is * on the shrink list (ie not on the superblock LRU list). * * The per-cpu "nr_dentry_unused" counters are updated with * the DCACHE_LRU_LIST bit. * * The per-cpu "nr_dentry_negative" counters are only updated * when deleted from or added to the per-superblock LRU list, not * from/to the shrink list. That is to avoid an unneeded dec/inc * pair when moving from LRU to shrink list in select_collect(). * * These helper functions make sure we always follow the * rules. d_lock must be held by the caller. */ #define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x)) static void d_lru_add(struct dentry *dentry) { D_FLAG_VERIFY(dentry, 0); dentry->d_flags |= DCACHE_LRU_LIST; this_cpu_inc(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_inc(nr_dentry_negative); WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_lru_del(struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_shrink_del(struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); list_del_init(&dentry->d_lru); dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); this_cpu_dec(nr_dentry_unused); } static void d_shrink_add(struct dentry *dentry, struct list_head *list) { D_FLAG_VERIFY(dentry, 0); list_add(&dentry->d_lru, list); dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST; this_cpu_inc(nr_dentry_unused); } /* * These can only be called under the global LRU lock, ie during the * callback for freeing the LRU list. "isolate" removes it from the * LRU lists entirely, while shrink_move moves it to the indicated * private list. */ static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); list_lru_isolate(lru, &dentry->d_lru); } static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, struct list_head *list) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags |= DCACHE_SHRINK_LIST; if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); list_lru_isolate_move(lru, &dentry->d_lru, list); } /** * d_drop - drop a dentry * @dentry: dentry to drop * * d_drop() unhashes the entry from the parent dentry hashes, so that it won't * be found through a VFS lookup any more. Note that this is different from * deleting the dentry - d_delete will try to mark the dentry negative if * possible, giving a successful _negative_ lookup, while d_drop will * just make the cache lookup fail. * * d_drop() is used mainly for stuff that wants to invalidate a dentry for some * reason (NFS timeouts or autofs deletes). * * __d_drop requires dentry->d_lock * ___d_drop doesn't mark dentry as "unhashed" * (dentry->d_hash.pprev will be LIST_POISON2, not NULL). */ static void ___d_drop(struct dentry *dentry) { struct hlist_bl_head *b; /* * Hashed dentries are normally on the dentry hashtable, * with the exception of those newly allocated by * d_obtain_root, which are always IS_ROOT: */ if (unlikely(IS_ROOT(dentry))) b = &dentry->d_sb->s_roots; else b = d_hash(dentry->d_name.hash); hlist_bl_lock(b); __hlist_bl_del(&dentry->d_hash); hlist_bl_unlock(b); } void __d_drop(struct dentry *dentry) { if (!d_unhashed(dentry)) { ___d_drop(dentry); dentry->d_hash.pprev = NULL; write_seqcount_invalidate(&dentry->d_seq); } } EXPORT_SYMBOL(__d_drop); void d_drop(struct dentry *dentry) { spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(d_drop); static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent) { struct dentry *next; /* * Inform d_walk() and shrink_dentry_list() that we are no longer * attached to the dentry tree */ dentry->d_flags |= DCACHE_DENTRY_KILLED; if (unlikely(list_empty(&dentry->d_child))) return; __list_del_entry(&dentry->d_child); /* * Cursors can move around the list of children. While we'd been * a normal list member, it didn't matter - ->d_child.next would've * been updated. However, from now on it won't be and for the * things like d_walk() it might end up with a nasty surprise. * Normally d_walk() doesn't care about cursors moving around - * ->d_lock on parent prevents that and since a cursor has no children * of its own, we get through it without ever unlocking the parent. * There is one exception, though - if we ascend from a child that * gets killed as soon as we unlock it, the next sibling is found * using the value left in its ->d_child.next. And if _that_ * pointed to a cursor, and cursor got moved (e.g. by lseek()) * before d_walk() regains parent->d_lock, we'll end up skipping * everything the cursor had been moved past. * * Solution: make sure that the pointer left behind in ->d_child.next * points to something that won't be moving around. I.e. skip the * cursors. */ while (dentry->d_child.next != &parent->d_subdirs) { next = list_entry(dentry->d_child.next, struct dentry, d_child); if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR))) break; dentry->d_child.next = next->d_child.next; } } static void __dentry_kill(struct dentry *dentry) { struct dentry *parent = NULL; bool can_free = true; if (!IS_ROOT(dentry)) parent = dentry->d_parent; /* * The dentry is now unrecoverably dead to the world. */ lockref_mark_dead(&dentry->d_lockref); /* * inform the fs via d_prune that this dentry is about to be * unhashed and destroyed. */ if (dentry->d_flags & DCACHE_OP_PRUNE) dentry->d_op->d_prune(dentry); if (dentry->d_flags & DCACHE_LRU_LIST) { if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) d_lru_del(dentry); } /* if it was on the hash then remove it */ __d_drop(dentry); dentry_unlist(dentry, parent); if (parent) spin_unlock(&parent->d_lock); if (dentry->d_inode) dentry_unlink_inode(dentry); else spin_unlock(&dentry->d_lock); this_cpu_dec(nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); spin_lock(&dentry->d_lock); if (dentry->d_flags & DCACHE_SHRINK_LIST) { dentry->d_flags |= DCACHE_MAY_FREE; can_free = false; } spin_unlock(&dentry->d_lock); if (likely(can_free)) dentry_free(dentry); cond_resched(); } static struct dentry *__lock_parent(struct dentry *dentry) { struct dentry *parent; rcu_read_lock(); spin_unlock(&dentry->d_lock); again: parent = READ_ONCE(dentry->d_parent); spin_lock(&parent->d_lock); /* * We can't blindly lock dentry until we are sure * that we won't violate the locking order. * Any changes of dentry->d_parent must have * been done with parent->d_lock held, so * spin_lock() above is enough of a barrier * for checking if it's still our child. */ if (unlikely(parent != dentry->d_parent)) { spin_unlock(&parent->d_lock); goto again; } rcu_read_unlock(); if (parent != dentry) spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); else parent = NULL; return parent; } static inline struct dentry *lock_parent(struct dentry *dentry) { struct dentry *parent = dentry->d_parent; if (IS_ROOT(dentry)) return NULL; if (likely(spin_trylock(&parent->d_lock))) return parent; return __lock_parent(dentry); } static inline bool retain_dentry(struct dentry *dentry) { WARN_ON(d_in_lookup(dentry)); /* Unreachable? Get rid of it */ if (unlikely(d_unhashed(dentry))) return false; if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) return false; if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) { if (dentry->d_op->d_delete(dentry)) return false; } if (unlikely(dentry->d_flags & DCACHE_DONTCACHE)) return false; /* retain; LRU fodder */ dentry->d_lockref.count--; if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) d_lru_add(dentry); else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED))) dentry->d_flags |= DCACHE_REFERENCED; return true; } void d_mark_dontcache(struct inode *inode) { struct dentry *de; spin_lock(&inode->i_lock); hlist_for_each_entry(de, &inode->i_dentry, d_u.d_alias) { spin_lock(&de->d_lock); de->d_flags |= DCACHE_DONTCACHE; spin_unlock(&de->d_lock); } inode->i_state |= I_DONTCACHE; spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_mark_dontcache); /* * Finish off a dentry we've decided to kill. * dentry->d_lock must be held, returns with it unlocked. * Returns dentry requiring refcount drop, or NULL if we're done. */ static struct dentry *dentry_kill(struct dentry *dentry) __releases(dentry->d_lock) { struct inode *inode = dentry->d_inode; struct dentry *parent = NULL; if (inode && unlikely(!spin_trylock(&inode->i_lock))) goto slow_positive; if (!IS_ROOT(dentry)) { parent = dentry->d_parent; if (unlikely(!spin_trylock(&parent->d_lock))) { parent = __lock_parent(dentry); if (likely(inode || !dentry->d_inode)) goto got_locks; /* negative that became positive */ if (parent) spin_unlock(&parent->d_lock); inode = dentry->d_inode; goto slow_positive; } } __dentry_kill(dentry); return parent; slow_positive: spin_unlock(&dentry->d_lock); spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); parent = lock_parent(dentry); got_locks: if (unlikely(dentry->d_lockref.count != 1)) { dentry->d_lockref.count--; } else if (likely(!retain_dentry(dentry))) { __dentry_kill(dentry); return parent; } /* we are keeping it, after all */ if (inode) spin_unlock(&inode->i_lock); if (parent) spin_unlock(&parent->d_lock); spin_unlock(&dentry->d_lock); return NULL; } /* * Try to do a lockless dput(), and return whether that was successful. * * If unsuccessful, we return false, having already taken the dentry lock. * * The caller needs to hold the RCU read lock, so that the dentry is * guaranteed to stay around even if the refcount goes down to zero! */ static inline bool fast_dput(struct dentry *dentry) { int ret; unsigned int d_flags; /* * If we have a d_op->d_delete() operation, we sould not * let the dentry count go to zero, so use "put_or_lock". */ if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) return lockref_put_or_lock(&dentry->d_lockref); /* * .. otherwise, we can try to just decrement the * lockref optimistically. */ ret = lockref_put_return(&dentry->d_lockref); /* * If the lockref_put_return() failed due to the lock being held * by somebody else, the fast path has failed. We will need to * get the lock, and then check the count again. */ if (unlikely(ret < 0)) { spin_lock(&dentry->d_lock); if (dentry->d_lockref.count > 1) { dentry->d_lockref.count--; spin_unlock(&dentry->d_lock); return true; } return false; } /* * If we weren't the last ref, we're done. */ if (ret) return true; /* * Careful, careful. The reference count went down * to zero, but we don't hold the dentry lock, so * somebody else could get it again, and do another * dput(), and we need to not race with that. * * However, there is a very special and common case * where we don't care, because there is nothing to * do: the dentry is still hashed, it does not have * a 'delete' op, and it's referenced and already on * the LRU list. * * NOTE! Since we aren't locked, these values are * not "stable". However, it is sufficient that at * some point after we dropped the reference the * dentry was hashed and the flags had the proper * value. Other dentry users may have re-gotten * a reference to the dentry and change that, but * our work is done - we can leave the dentry * around with a zero refcount. */ smp_rmb(); d_flags = READ_ONCE(dentry->d_flags); d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_DISCONNECTED; /* Nothing to do? Dropping the reference was all we needed? */ if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry)) return true; /* * Not the fast normal case? Get the lock. We've already decremented * the refcount, but we'll need to re-check the situation after * getting the lock. */ spin_lock(&dentry->d_lock); /* * Did somebody else grab a reference to it in the meantime, and * we're no longer the last user after all? Alternatively, somebody * else could have killed it and marked it dead. Either way, we * don't need to do anything else. */ if (dentry->d_lockref.count) { spin_unlock(&dentry->d_lock); return true; } /* * Re-get the reference we optimistically dropped. We hold the * lock, and we just tested that it was zero, so we can just * set it to 1. */ dentry->d_lockref.count = 1; return false; } /* * This is dput * * This is complicated by the fact that we do not want to put * dentries that are no longer on any hash chain on the unused * list: we'd much rather just get rid of them immediately. * * However, that implies that we have to traverse the dentry * tree upwards to the parents which might _also_ now be * scheduled for deletion (it may have been only waiting for * its last child to go away). * * This tail recursion is done by hand as we don't want to depend * on the compiler to always get this right (gcc generally doesn't). * Real recursion would eat up our stack space. */ /* * dput - release a dentry * @dentry: dentry to release * * Release a dentry. This will drop the usage count and if appropriate * call the dentry unlink method as well as removing it from the queues and * releasing its resources. If the parent dentries were scheduled for release * they too may now get deleted. */ void dput(struct dentry *dentry) { while (dentry) { might_sleep(); rcu_read_lock(); if (likely(fast_dput(dentry))) { rcu_read_unlock(); return; } /* Slow case: now with the dentry lock held */ rcu_read_unlock(); if (likely(retain_dentry(dentry))) { spin_unlock(&dentry->d_lock); return; } dentry = dentry_kill(dentry); } } EXPORT_SYMBOL(dput); static void __dput_to_list(struct dentry *dentry, struct list_head *list) __must_hold(&dentry->d_lock) { if (dentry->d_flags & DCACHE_SHRINK_LIST) { /* let the owner of the list it's on deal with it */ --dentry->d_lockref.count; } else { if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); if (!--dentry->d_lockref.count) d_shrink_add(dentry, list); } } void dput_to_list(struct dentry *dentry, struct list_head *list) { rcu_read_lock(); if (likely(fast_dput(dentry))) { rcu_read_unlock(); return; } rcu_read_unlock(); if (!retain_dentry(dentry)) __dput_to_list(dentry, list); spin_unlock(&dentry->d_lock); } /* This must be called with d_lock held */ static inline void __dget_dlock(struct dentry *dentry) { dentry->d_lockref.count++; } static inline void __dget(struct dentry *dentry) { lockref_get(&dentry->d_lockref); } struct dentry *dget_parent(struct dentry *dentry) { int gotref; struct dentry *ret; unsigned seq; /* * Do optimistic parent lookup without any * locking. */ rcu_read_lock(); seq = raw_seqcount_begin(&dentry->d_seq); ret = READ_ONCE(dentry->d_parent); gotref = lockref_get_not_zero(&ret->d_lockref); rcu_read_unlock(); if (likely(gotref)) { if (!read_seqcount_retry(&dentry->d_seq, seq)) return ret; dput(ret); } repeat: /* * Don't need rcu_dereference because we re-check it was correct under * the lock. */ rcu_read_lock(); ret = dentry->d_parent; spin_lock(&ret->d_lock); if (unlikely(ret != dentry->d_parent)) { spin_unlock(&ret->d_lock); rcu_read_unlock(); goto repeat; } rcu_read_unlock(); BUG_ON(!ret->d_lockref.count); ret->d_lockref.count++; spin_unlock(&ret->d_lock); return ret; } EXPORT_SYMBOL(dget_parent); static struct dentry * __d_find_any_alias(struct inode *inode) { struct dentry *alias; if (hlist_empty(&inode->i_dentry)) return NULL; alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); __dget(alias); return alias; } /** * d_find_any_alias - find any alias for a given inode * @inode: inode to find an alias for * * If any aliases exist for the given inode, take and return a * reference for one of them. If no aliases exist, return %NULL. */ struct dentry *d_find_any_alias(struct inode *inode) { struct dentry *de; spin_lock(&inode->i_lock); de = __d_find_any_alias(inode); spin_unlock(&inode->i_lock); return de; } EXPORT_SYMBOL(d_find_any_alias); /** * d_find_alias - grab a hashed alias of inode * @inode: inode in question * * If inode has a hashed alias, or is a directory and has any alias, * acquire the reference to alias and return it. Otherwise return NULL. * Notice that if inode is a directory there can be only one alias and * it can be unhashed only if it has no children, or if it is the root * of a filesystem, or if the directory was renamed and d_revalidate * was the first vfs operation to notice. * * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer * any other hashed alias over that one. */ static struct dentry *__d_find_alias(struct inode *inode) { struct dentry *alias; if (S_ISDIR(inode->i_mode)) return __d_find_any_alias(inode); hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { spin_lock(&alias->d_lock); if (!d_unhashed(alias)) { __dget_dlock(alias); spin_unlock(&alias->d_lock); return alias; } spin_unlock(&alias->d_lock); } return NULL; } struct dentry *d_find_alias(struct inode *inode) { struct dentry *de = NULL; if (!hlist_empty(&inode->i_dentry)) { spin_lock(&inode->i_lock); de = __d_find_alias(inode); spin_unlock(&inode->i_lock); } return de; } EXPORT_SYMBOL(d_find_alias); /* * Try to kill dentries associated with this inode. * WARNING: you must own a reference to inode. */ void d_prune_aliases(struct inode *inode) { struct dentry *dentry; restart: spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { spin_lock(&dentry->d_lock); if (!dentry->d_lockref.count) { struct dentry *parent = lock_parent(dentry); if (likely(!dentry->d_lockref.count)) { __dentry_kill(dentry); dput(parent); goto restart; } if (parent) spin_unlock(&parent->d_lock); } spin_unlock(&dentry->d_lock); } spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_prune_aliases); /* * Lock a dentry from shrink list. * Called under rcu_read_lock() and dentry->d_lock; the former * guarantees that nothing we access will be freed under us. * Note that dentry is *not* protected from concurrent dentry_kill(), * d_delete(), etc. * * Return false if dentry has been disrupted or grabbed, leaving * the caller to kick it off-list. Otherwise, return true and have * that dentry's inode and parent both locked. */ static bool shrink_lock_dentry(struct dentry *dentry) { struct inode *inode; struct dentry *parent; if (dentry->d_lockref.count) return false; inode = dentry->d_inode; if (inode && unlikely(!spin_trylock(&inode->i_lock))) { spin_unlock(&dentry->d_lock); spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); if (unlikely(dentry->d_lockref.count)) goto out; /* changed inode means that somebody had grabbed it */ if (unlikely(inode != dentry->d_inode)) goto out; } parent = dentry->d_parent; if (IS_ROOT(dentry) || likely(spin_trylock(&parent->d_lock))) return true; spin_unlock(&dentry->d_lock); spin_lock(&parent->d_lock); if (unlikely(parent != dentry->d_parent)) { spin_unlock(&parent->d_lock); spin_lock(&dentry->d_lock); goto out; } spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); if (likely(!dentry->d_lockref.count)) return true; spin_unlock(&parent->d_lock); out: if (inode) spin_unlock(&inode->i_lock); return false; } void shrink_dentry_list(struct list_head *list) { while (!list_empty(list)) { struct dentry *dentry, *parent; dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); rcu_read_lock(); if (!shrink_lock_dentry(dentry)) { bool can_free = false; rcu_read_unlock(); d_shrink_del(dentry); if (dentry->d_lockref.count < 0) can_free = dentry->d_flags & DCACHE_MAY_FREE; spin_unlock(&dentry->d_lock); if (can_free) dentry_free(dentry); continue; } rcu_read_unlock(); d_shrink_del(dentry); parent = dentry->d_parent; if (parent != dentry) __dput_to_list(parent, list); __dentry_kill(dentry); } } static enum lru_status dentry_lru_isolate(struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); /* * we are inverting the lru lock/dentry->d_lock here, * so use a trylock. If we fail to get the lock, just skip * it */ if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; /* * Referenced dentries are still in use. If they have active * counts, just remove them from the LRU. Otherwise give them * another pass through the LRU. */ if (dentry->d_lockref.count) { d_lru_isolate(lru, dentry); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } if (dentry->d_flags & DCACHE_REFERENCED) { dentry->d_flags &= ~DCACHE_REFERENCED; spin_unlock(&dentry->d_lock); /* * The list move itself will be made by the common LRU code. At * this point, we've dropped the dentry->d_lock but keep the * lru lock. This is safe to do, since every list movement is * protected by the lru lock even if both locks are held. * * This is guaranteed by the fact that all LRU management * functions are intermediated by the LRU API calls like * list_lru_add and list_lru_del. List movement in this file * only ever occur through this functions or through callbacks * like this one, that are called from the LRU API. * * The only exceptions to this are functions like * shrink_dentry_list, and code that first checks for the * DCACHE_SHRINK_LIST flag. Those are guaranteed to be * operating only with stack provided lists after they are * properly isolated from the main list. It is thus, always a * local access. */ return LRU_ROTATE; } d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } /** * prune_dcache_sb - shrink the dcache * @sb: superblock * @sc: shrink control, passed to list_lru_shrink_walk() * * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This * is done when we need more memory and called from the superblock shrinker * function. * * This function may fail to free any resources if all the dentries are in * use. */ long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc) { LIST_HEAD(dispose); long freed; freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc, dentry_lru_isolate, &dispose); shrink_dentry_list(&dispose); return freed; } static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); /* * we are inverting the lru lock/dentry->d_lock here, * so use a trylock. If we fail to get the lock, just skip * it */ if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } /** * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock * * Shrink the dcache for the specified super block. This is used to free * the dcache before unmounting a file system. */ void shrink_dcache_sb(struct super_block *sb) { do { LIST_HEAD(dispose); list_lru_walk(&sb->s_dentry_lru, dentry_lru_isolate_shrink, &dispose, 1024); shrink_dentry_list(&dispose); } while (list_lru_count(&sb->s_dentry_lru) > 0); } EXPORT_SYMBOL(shrink_dcache_sb); /** * enum d_walk_ret - action to talke during tree walk * @D_WALK_CONTINUE: contrinue walk * @D_WALK_QUIT: quit walk * @D_WALK_NORETRY: quit when retry is needed * @D_WALK_SKIP: skip this dentry and its children */ enum d_walk_ret { D_WALK_CONTINUE, D_WALK_QUIT, D_WALK_NORETRY, D_WALK_SKIP, }; /** * d_walk - walk the dentry tree * @parent: start of walk * @data: data passed to @enter() and @finish() * @enter: callback when first entering the dentry * * The @enter() callbacks are called with d_lock held. */ static void d_walk(struct dentry *parent, void *data, enum d_walk_ret (*enter)(void *, struct dentry *)) { struct dentry *this_parent; struct list_head *next; unsigned seq = 0; enum d_walk_ret ret; bool retry = true; again: read_seqbegin_or_lock(&rename_lock, &seq); this_parent = parent; spin_lock(&this_parent->d_lock); ret = enter(data, this_parent); switch (ret) { case D_WALK_CONTINUE: break; case D_WALK_QUIT: case D_WALK_SKIP: goto out_unlock; case D_WALK_NORETRY: retry = false; break; } repeat: next = this_parent->d_subdirs.next; resume: while (next != &this_parent->d_subdirs) { struct list_head *tmp = next; struct dentry *dentry = list_entry(tmp, struct dentry, d_child); next = tmp->next; if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR)) continue; spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); ret = enter(data, dentry); switch (ret) { case D_WALK_CONTINUE: break; case D_WALK_QUIT: spin_unlock(&dentry->d_lock); goto out_unlock; case D_WALK_NORETRY: retry = false; break; case D_WALK_SKIP: spin_unlock(&dentry->d_lock); continue; } if (!list_empty(&dentry->d_subdirs)) { spin_unlock(&this_parent->d_lock); spin_release(&dentry->d_lock.dep_map, _RET_IP_); this_parent = dentry; spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); goto repeat; } spin_unlock(&dentry->d_lock); } /* * All done at this level ... ascend and resume the search. */ rcu_read_lock(); ascend: if (this_parent != parent) { struct dentry *child = this_parent; this_parent = child->d_parent; spin_unlock(&child->d_lock); spin_lock(&this_parent->d_lock); /* might go back up the wrong parent if we have had a rename. */ if (need_seqretry(&rename_lock, seq)) goto rename_retry; /* go into the first sibling still alive */ do { next = child->d_child.next; if (next == &this_parent->d_subdirs) goto ascend; child = list_entry(next, struct dentry, d_child); } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)); rcu_read_unlock(); goto resume; } if (need_seqretry(&rename_lock, seq)) goto rename_retry; rcu_read_unlock(); out_unlock: spin_unlock(&this_parent->d_lock); done_seqretry(&rename_lock, seq); return; rename_retry: spin_unlock(&this_parent->d_lock); rcu_read_unlock(); BUG_ON(seq & 1); if (!retry) return; seq = 1; goto again; } struct check_mount { struct vfsmount *mnt; unsigned int mounted; }; static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry) { struct check_mount *info = data; struct path path = { .mnt = info->mnt, .dentry = dentry }; if (likely(!d_mountpoint(dentry))) return D_WALK_CONTINUE; if (__path_is_mountpoint(&path)) { info->mounted = 1; return D_WALK_QUIT; } return D_WALK_CONTINUE; } /** * path_has_submounts - check for mounts over a dentry in the * current namespace. * @parent: path to check. * * Return true if the parent or its subdirectories contain * a mount point in the current namespace. */ int path_has_submounts(const struct path *parent) { struct check_mount data = { .mnt = parent->mnt, .mounted = 0 }; read_seqlock_excl(&mount_lock); d_walk(parent->dentry, &data, path_check_mount); read_sequnlock_excl(&mount_lock); return data.mounted; } EXPORT_SYMBOL(path_has_submounts); /* * Called by mount code to set a mountpoint and check if the mountpoint is * reachable (e.g. NFS can unhash a directory dentry and then the complete * subtree can become unreachable). * * Only one of d_invalidate() and d_set_mounted() must succeed. For * this reason take rename_lock and d_lock on dentry and ancestors. */ int d_set_mounted(struct dentry *dentry) { struct dentry *p; int ret = -ENOENT; write_seqlock(&rename_lock); for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) { /* Need exclusion wrt. d_invalidate() */ spin_lock(&p->d_lock); if (unlikely(d_unhashed(p))) { spin_unlock(&p->d_lock); goto out; } spin_unlock(&p->d_lock); } spin_lock(&dentry->d_lock); if (!d_unlinked(dentry)) { ret = -EBUSY; if (!d_mountpoint(dentry)) { dentry->d_flags |= DCACHE_MOUNTED; ret = 0; } } spin_unlock(&dentry->d_lock); out: write_sequnlock(&rename_lock); return ret; } /* * Search the dentry child list of the specified parent, * and move any unused dentries to the end of the unused * list for prune_dcache(). We descend to the next level * whenever the d_subdirs list is non-empty and continue * searching. * * It returns zero iff there are no unused children, * otherwise it returns the number of children moved to * the end of the unused list. This may not be the total * number of unused children, because select_parent can * drop the lock and return early due to latency * constraints. */ struct select_data { struct dentry *start; union { long found; struct dentry *victim; }; struct list_head dispose; }; static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) { struct select_data *data = _data; enum d_walk_ret ret = D_WALK_CONTINUE; if (data->start == dentry) goto out; if (dentry->d_flags & DCACHE_SHRINK_LIST) { data->found++; } else { if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); if (!dentry->d_lockref.count) { d_shrink_add(dentry, &data->dispose); data->found++; } } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. */ if (!list_empty(&data->dispose)) ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: return ret; } static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry) { struct select_data *data = _data; enum d_walk_ret ret = D_WALK_CONTINUE; if (data->start == dentry) goto out; if (dentry->d_flags & DCACHE_SHRINK_LIST) { if (!dentry->d_lockref.count) { rcu_read_lock(); data->victim = dentry; return D_WALK_QUIT; } } else { if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); if (!dentry->d_lockref.count) d_shrink_add(dentry, &data->dispose); } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. */ if (!list_empty(&data->dispose)) ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: return ret; } /** * shrink_dcache_parent - prune dcache * @parent: parent of entries to prune * * Prune the dcache to remove unused children of the parent dentry. */ void shrink_dcache_parent(struct dentry *parent) { for (;;) { struct select_data data = {.start = parent}; INIT_LIST_HEAD(&data.dispose); d_walk(parent, &data, select_collect); if (!list_empty(&data.dispose)) { shrink_dentry_list(&data.dispose); continue; } cond_resched(); if (!data.found) break; data.victim = NULL; d_walk(parent, &data, select_collect2); if (data.victim) { struct dentry *parent; spin_lock(&data.victim->d_lock); if (!shrink_lock_dentry(data.victim)) { spin_unlock(&data.victim->d_lock); rcu_read_unlock(); } else { rcu_read_unlock(); parent = data.victim->d_parent; if (parent != data.victim) __dput_to_list(parent, &data.dispose); __dentry_kill(data.victim); } } if (!list_empty(&data.dispose)) shrink_dentry_list(&data.dispose); } } EXPORT_SYMBOL(shrink_dcache_parent); static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) { /* it has busy descendents; complain about those instead */ if (!list_empty(&dentry->d_subdirs)) return D_WALK_CONTINUE; /* root with refcount 1 is fine */ if (dentry == _data && dentry->d_lockref.count == 1) return D_WALK_CONTINUE; printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} " " still in use (%d) [unmount of %s %s]\n", dentry, dentry->d_inode ? dentry->d_inode->i_ino : 0UL, dentry, dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); WARN_ON(1); return D_WALK_CONTINUE; } static void do_one_tree(struct dentry *dentry) { shrink_dcache_parent(dentry); d_walk(dentry, dentry, umount_check); d_drop(dentry); dput(dentry); } /* * destroy the dentries attached to a superblock on unmounting */ void shrink_dcache_for_umount(struct super_block *sb) { struct dentry *dentry; WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked"); dentry = sb->s_root; sb->s_root = NULL; do_one_tree(dentry); while (!hlist_bl_empty(&sb->s_roots)) { dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_roots), struct dentry, d_hash)); do_one_tree(dentry); } } static enum d_walk_ret find_submount(void *_data, struct dentry *dentry) { struct dentry **victim = _data; if (d_mountpoint(dentry)) { __dget_dlock(dentry); *victim = dentry; return D_WALK_QUIT; } return D_WALK_CONTINUE; } /** * d_invalidate - detach submounts, prune dcache, and drop * @dentry: dentry to invalidate (aka detach, prune and drop) */ void d_invalidate(struct dentry *dentry) { bool had_submounts = false; spin_lock(&dentry->d_lock); if (d_unhashed(dentry)) { spin_unlock(&dentry->d_lock); return; } __d_drop(dentry); spin_unlock(&dentry->d_lock); /* Negative dentries can be dropped without further checks */ if (!dentry->d_inode) return; shrink_dcache_parent(dentry); for (;;) { struct dentry *victim = NULL; d_walk(dentry, &victim, find_submount); if (!victim) { if (had_submounts) shrink_dcache_parent(dentry); return; } had_submounts = true; detach_mounts(victim); dput(victim); } } EXPORT_SYMBOL(d_invalidate); /** * __d_alloc - allocate a dcache entry * @sb: filesystem it will belong to * @name: qstr of the name * * Allocates a dentry. It returns %NULL if there is insufficient memory * available. On a success the dentry is returned. The name passed in is * copied and the copy passed in may be reused after this call. */ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { struct dentry *dentry; char *dname; int err; dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); if (!dentry) return NULL; /* * We guarantee that the inline name is always NUL-terminated. * This way the memcpy() done by the name switching in rename * will still always have a NUL at the end, even if we might * be overwriting an internal NUL character */ dentry->d_iname[DNAME_INLINE_LEN-1] = 0; if (unlikely(!name)) { name = &slash_name; dname = dentry->d_iname; } else if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); struct external_name *p = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); if (!p) { kmem_cache_free(dentry_cache, dentry); return NULL; } atomic_set(&p->u.count, 1); dname = p->name; } else { dname = dentry->d_iname; } dentry->d_name.len = name->len; dentry->d_name.hash = name->hash; memcpy(dname, name->name, name->len); dname[name->len] = 0; /* Make sure we always see the terminating NUL character */ smp_store_release(&dentry->d_name.name, dname); /* ^^^ */ dentry->d_lockref.count = 1; dentry->d_flags = 0; spin_lock_init(&dentry->d_lock); seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock); dentry->d_inode = NULL; dentry->d_parent = dentry; dentry->d_sb = sb; dentry->d_op = NULL; dentry->d_fsdata = NULL; INIT_HLIST_BL_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_LIST_HEAD(&dentry->d_subdirs); INIT_HLIST_NODE(&dentry->d_u.d_alias); INIT_LIST_HEAD(&dentry->d_child); d_set_d_op(dentry, dentry->d_sb->s_d_op); if (dentry->d_op && dentry->d_op->d_init) { err = dentry->d_op->d_init(dentry); if (err) { if (dname_external(dentry)) kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); return NULL; } } this_cpu_inc(nr_dentry); return dentry; } /** * d_alloc - allocate a dcache entry * @parent: parent of entry to allocate * @name: qstr of the name * * Allocates a dentry. It returns %NULL if there is insufficient memory * available. On a success the dentry is returned. The name passed in is * copied and the copy passed in may be reused after this call. */ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) { struct dentry *dentry = __d_alloc(parent->d_sb, name); if (!dentry) return NULL; spin_lock(&parent->d_lock); /* * don't need child lock because it is not subject * to concurrency here */ __dget_dlock(parent); dentry->d_parent = parent; list_add(&dentry->d_child, &parent->d_subdirs); spin_unlock(&parent->d_lock); return dentry; } EXPORT_SYMBOL(d_alloc); struct dentry *d_alloc_anon(struct super_block *sb) { return __d_alloc(sb, NULL); } EXPORT_SYMBOL(d_alloc_anon); struct dentry *d_alloc_cursor(struct dentry * parent) { struct dentry *dentry = d_alloc_anon(parent->d_sb); if (dentry) { dentry->d_flags |= DCACHE_DENTRY_CURSOR; dentry->d_parent = dget(parent); } return dentry; } /** * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems) * @sb: the superblock * @name: qstr of the name * * For a filesystem that just pins its dentries in memory and never * performs lookups at all, return an unhashed IS_ROOT dentry. * This is used for pipes, sockets et.al. - the stuff that should * never be anyone's children or parents. Unlike all other * dentries, these will not have RCU delay between dropping the * last reference and freeing them. * * The only user is alloc_file_pseudo() and that's what should * be considered a public interface. Don't use directly. */ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) { struct dentry *dentry = __d_alloc(sb, name); if (likely(dentry)) dentry->d_flags |= DCACHE_NORCU; return dentry; } struct dentry *d_alloc_name(struct dentry *parent, const char *name) { struct qstr q; q.name = name; q.hash_len = hashlen_string(parent, name); return d_alloc(parent, &q); } EXPORT_SYMBOL(d_alloc_name); void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) { WARN_ON_ONCE(dentry->d_op); WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH | DCACHE_OP_COMPARE | DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_DELETE | DCACHE_OP_REAL)); dentry->d_op = op; if (!op) return; if (op->d_hash) dentry->d_flags |= DCACHE_OP_HASH; if (op->d_compare) dentry->d_flags |= DCACHE_OP_COMPARE; if (op->d_revalidate) dentry->d_flags |= DCACHE_OP_REVALIDATE; if (op->d_weak_revalidate) dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE; if (op->d_delete) dentry->d_flags |= DCACHE_OP_DELETE; if (op->d_prune) dentry->d_flags |= DCACHE_OP_PRUNE; if (op->d_real) dentry->d_flags |= DCACHE_OP_REAL; } EXPORT_SYMBOL(d_set_d_op); /* * d_set_fallthru - Mark a dentry as falling through to a lower layer * @dentry - The dentry to mark * * Mark a dentry as falling through to the lower layer (as set with * d_pin_lower()). This flag may be recorded on the medium. */ void d_set_fallthru(struct dentry *dentry) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_FALLTHRU; spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(d_set_fallthru); static unsigned d_flags_for_inode(struct inode *inode) { unsigned add_flags = DCACHE_REGULAR_TYPE; if (!inode) return DCACHE_MISS_TYPE; if (S_ISDIR(inode->i_mode)) { add_flags = DCACHE_DIRECTORY_TYPE; if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) { if (unlikely(!inode->i_op->lookup)) add_flags = DCACHE_AUTODIR_TYPE; else inode->i_opflags |= IOP_LOOKUP; } goto type_determined; } if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { if (unlikely(inode->i_op->get_link)) { add_flags = DCACHE_SYMLINK_TYPE; goto type_determined; } inode->i_opflags |= IOP_NOFOLLOW; } if (unlikely(!S_ISREG(inode->i_mode))) add_flags = DCACHE_SPECIAL_TYPE; type_determined: if (unlikely(IS_AUTOMOUNT(inode))) add_flags |= DCACHE_NEED_AUTOMOUNT; return add_flags; } static void __d_instantiate(struct dentry *dentry, struct inode *inode) { unsigned add_flags = d_flags_for_inode(inode); WARN_ON(d_in_lookup(dentry)); spin_lock(&dentry->d_lock); /* * Decrement negative dentry count if it was in the LRU list. */ if (dentry->d_flags & DCACHE_LRU_LIST) this_cpu_dec(nr_dentry_negative); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); fsnotify_update_flags(dentry); spin_unlock(&dentry->d_lock); } /** * d_instantiate - fill in inode information for a dentry * @entry: dentry to complete * @inode: inode to attach to this dentry * * Fill in inode information in the entry. * * This turns negative dentries into productive full members * of society. * * NOTE! This assumes that the inode count has been incremented * (or otherwise set) by the caller to indicate that it is now * in use by the dcache. */ void d_instantiate(struct dentry *entry, struct inode * inode) { BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); if (inode) { security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); __d_instantiate(entry, inode); spin_unlock(&inode->i_lock); } } EXPORT_SYMBOL(d_instantiate); /* * This should be equivalent to d_instantiate() + unlock_new_inode(), * with lockdep-related part of unlock_new_inode() done before * anything else. Use that instead of open-coding d_instantiate()/ * unlock_new_inode() combinations. */ void d_instantiate_new(struct dentry *entry, struct inode *inode) { BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); BUG_ON(!inode); lockdep_annotate_inode_mutex_key(inode); security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); __d_instantiate(entry, inode); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW & ~I_CREATING; smp_mb(); wake_up_bit(&inode->i_state, __I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_instantiate_new); struct dentry *d_make_root(struct inode *root_inode) { struct dentry *res = NULL; if (root_inode) { res = d_alloc_anon(root_inode->i_sb); if (res) d_instantiate(res, root_inode); else iput(root_inode); } return res; } EXPORT_SYMBOL(d_make_root); static struct dentry *__d_instantiate_anon(struct dentry *dentry, struct inode *inode, bool disconnected) { struct dentry *res; unsigned add_flags; security_d_instantiate(dentry, inode); spin_lock(&inode->i_lock); res = __d_find_any_alias(inode); if (res) { spin_unlock(&inode->i_lock); dput(dentry); goto out_iput; } /* attach a disconnected dentry */ add_flags = d_flags_for_inode(inode); if (disconnected) add_flags |= DCACHE_DISCONNECTED; spin_lock(&dentry->d_lock); __d_set_inode_and_type(dentry, inode, add_flags); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); if (!disconnected) { hlist_bl_lock(&dentry->d_sb->s_roots); hlist_bl_add_head(&dentry->d_hash, &dentry->d_sb->s_roots); hlist_bl_unlock(&dentry->d_sb->s_roots); } spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); return dentry; out_iput: iput(inode); return res; } struct dentry *d_instantiate_anon(struct dentry *dentry, struct inode *inode) { return __d_instantiate_anon(dentry, inode, true); } EXPORT_SYMBOL(d_instantiate_anon); static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected) { struct dentry *tmp; struct dentry *res; if (!inode) return ERR_PTR(-ESTALE); if (IS_ERR(inode)) return ERR_CAST(inode); res = d_find_any_alias(inode); if (res) goto out_iput; tmp = d_alloc_anon(inode->i_sb); if (!tmp) { res = ERR_PTR(-ENOMEM); goto out_iput; } return __d_instantiate_anon(tmp, inode, disconnected); out_iput: iput(inode); return res; } /** * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode * @inode: inode to allocate the dentry for * * Obtain a dentry for an inode resulting from NFS filehandle conversion or * similar open by handle operations. The returned dentry may be anonymous, * or may have a full name (if the inode was already in the cache). * * When called on a directory inode, we must ensure that the inode only ever * has one dentry. If a dentry is found, that is returned instead of * allocating a new one. * * On successful return, the reference to the inode has been transferred * to the dentry. In case of an error the reference on the inode is released. * To make it easier to use in export operations a %NULL or IS_ERR inode may * be passed in and the error will be propagated to the return value, * with a %NULL @inode replaced by ERR_PTR(-ESTALE). */ struct dentry *d_obtain_alias(struct inode *inode) { return __d_obtain_alias(inode, true); } EXPORT_SYMBOL(d_obtain_alias); /** * d_obtain_root - find or allocate a dentry for a given inode * @inode: inode to allocate the dentry for * * Obtain an IS_ROOT dentry for the root of a filesystem. * * We must ensure that directory inodes only ever have one dentry. If a * dentry is found, that is returned instead of allocating a new one. * * On successful return, the reference to the inode has been transferred * to the dentry. In case of an error the reference on the inode is * released. A %NULL or IS_ERR inode may be passed in and will be the * error will be propagate to the return value, with a %NULL @inode * replaced by ERR_PTR(-ESTALE). */ struct dentry *d_obtain_root(struct inode *inode) { return __d_obtain_alias(inode, false); } EXPORT_SYMBOL(d_obtain_root); /** * d_add_ci - lookup or allocate new dentry with case-exact name * @inode: the inode case-insensitive lookup has found * @dentry: the negative dentry that was passed to the parent's lookup func * @name: the case-exact name to be associated with the returned dentry * * This is to avoid filling the dcache with case-insensitive names to the * same inode, only the actual correct case is stored in the dcache for * case-insensitive filesystems. * * For a case-insensitive lookup match and if the the case-exact dentry * already exists in in the dcache, use it and return it. * * If no entry exists with the exact case name, allocate new dentry with * the exact case, and return the spliced entry. */ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, struct qstr *name) { struct dentry *found, *res; /* * First check if a dentry matching the name already exists, * if not go ahead and create it now. */ found = d_hash_and_lookup(dentry->d_parent, name); if (found) { iput(inode); return found; } if (d_in_lookup(dentry)) { found = d_alloc_parallel(dentry->d_parent, name, dentry->d_wait); if (IS_ERR(found) || !d_in_lookup(found)) { iput(inode); return found; } } else { found = d_alloc(dentry->d_parent, name); if (!found) { iput(inode); return ERR_PTR(-ENOMEM); } } res = d_splice_alias(inode, found); if (res) { dput(found); return res; } return found; } EXPORT_SYMBOL(d_add_ci); static inline bool d_same_name(const struct dentry *dentry, const struct dentry *parent, const struct qstr *name) { if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) { if (dentry->d_name.len != name->len) return false; return dentry_cmp(dentry, name->name, name->len) == 0; } return parent->d_op->d_compare(dentry, dentry->d_name.len, dentry->d_name.name, name) == 0; } /** * __d_lookup_rcu - search for a dentry (racy, store-free) * @parent: parent dentry * @name: qstr of name we wish to find * @seqp: returns d_seq value at the point where the dentry was found * Returns: dentry, or NULL * * __d_lookup_rcu is the dcache lookup function for rcu-walk name * resolution (store-free path walking) design described in * Documentation/filesystems/path-lookup.txt. * * This is not to be used outside core vfs. * * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock * held, and rcu_read_lock held. The returned dentry must not be stored into * without taking d_lock and checking d_seq sequence count against @seq * returned here. * * A refcount may be taken on the found dentry with the d_rcu_to_refcount * function. * * Alternatively, __d_lookup_rcu may be called again to look up the child of * the returned dentry, so long as its parent's seqlock is checked after the * child is looked up. Thus, an interlocking stepping of sequence lock checks * is formed, giving integrity down the path walk. * * NOTE! The caller *has* to check the resulting dentry against the sequence * number we've returned before using any of the resulting dentry state! */ struct dentry *__d_lookup_rcu(const struct dentry *parent, const struct qstr *name, unsigned *seqp) { u64 hashlen = name->hash_len; const unsigned char *str = name->name; struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen)); struct hlist_bl_node *node; struct dentry *dentry; /* * Note: There is significant duplication with __d_lookup_rcu which is * required to prevent single threaded performance regressions * especially on architectures where smp_rmb (in seqcounts) are costly. * Keep the two functions in sync. */ /* * The hash list is protected using RCU. * * Carefully use d_seq when comparing a candidate dentry, to avoid * races with d_move(). * * It is possible that concurrent renames can mess up our list * walk here and result in missing our dentry, resulting in the * false-negative result. d_lookup() protects against concurrent * renames using rename_lock seqlock. * * See Documentation/filesystems/path-lookup.txt for more details. */ hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { unsigned seq; seqretry: /* * The dentry sequence count protects us from concurrent * renames, and thus protects parent and name fields. * * The caller must perform a seqcount check in order * to do anything useful with the returned dentry. * * NOTE! We do a "raw" seqcount_begin here. That means that * we don't wait for the sequence count to stabilize if it * is in the middle of a sequence change. If we do the slow * dentry compare, we will do seqretries until it is stable, * and if we end up with a successful lookup, we actually * want to exit RCU lookup anyway. * * Note that raw_seqcount_begin still *does* smp_rmb(), so * we are still guaranteed NUL-termination of ->d_name.name. */ seq = raw_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) continue; if (d_unhashed(dentry)) continue; if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { int tlen; const char *tname; if (dentry->d_name.hash != hashlen_hash(hashlen)) continue; tlen = dentry->d_name.len; tname = dentry->d_name.name; /* we want a consistent (name,len) pair */ if (read_seqcount_retry(&dentry->d_seq, seq)) { cpu_relax(); goto seqretry; } if (parent->d_op->d_compare(dentry, tlen, tname, name) != 0) continue; } else { if (dentry->d_name.hash_len != hashlen) continue; if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0) continue; } *seqp = seq; return dentry; } return NULL; } /** * d_lookup - search for a dentry * @parent: parent dentry * @name: qstr of name we wish to find * Returns: dentry, or NULL * * d_lookup searches the children of the parent dentry for the name in * question. If the dentry is found its reference count is incremented and the * dentry is returned. The caller must use dput to free the entry when it has * finished using it. %NULL is returned if the dentry does not exist. */ struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name) { struct dentry *dentry; unsigned seq; do { seq = read_seqbegin(&rename_lock); dentry = __d_lookup(parent, name); if (dentry) break; } while (read_seqretry(&rename_lock, seq)); return dentry; } EXPORT_SYMBOL(d_lookup); /** * __d_lookup - search for a dentry (racy) * @parent: parent dentry * @name: qstr of name we wish to find * Returns: dentry, or NULL * * __d_lookup is like d_lookup, however it may (rarely) return a * false-negative result due to unrelated rename activity. * * __d_lookup is slightly faster by avoiding rename_lock read seqlock, * however it must be used carefully, eg. with a following d_lookup in * the case of failure. * * __d_lookup callers must be commented. */ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) { unsigned int hash = name->hash; struct hlist_bl_head *b = d_hash(hash); struct hlist_bl_node *node; struct dentry *found = NULL; struct dentry *dentry; /* * Note: There is significant duplication with __d_lookup_rcu which is * required to prevent single threaded performance regressions * especially on architectures where smp_rmb (in seqcounts) are costly. * Keep the two functions in sync. */ /* * The hash list is protected using RCU. * * Take d_lock when comparing a candidate dentry, to avoid races * with d_move(). * * It is possible that concurrent renames can mess up our list * walk here and result in missing our dentry, resulting in the * false-negative result. d_lookup() protects against concurrent * renames using rename_lock seqlock. * * See Documentation/filesystems/path-lookup.txt for more details. */ rcu_read_lock(); hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { if (dentry->d_name.hash != hash) continue; spin_lock(&dentry->d_lock); if (dentry->d_parent != parent) goto next; if (d_unhashed(dentry)) goto next; if (!d_same_name(dentry, parent, name)) goto next; dentry->d_lockref.count++; found = dentry; spin_unlock(&dentry->d_lock); break; next: spin_unlock(&dentry->d_lock); } rcu_read_unlock(); return found; } /** * d_hash_and_lookup - hash the qstr then search for a dentry * @dir: Directory to search in * @name: qstr of name we wish to find * * On lookup failure NULL is returned; on bad name - ERR_PTR(-error) */ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) { /* * Check for a fs-specific hash function. Note that we must * calculate the standard hash first, as the d_op->d_hash() * routine may choose to leave the hash value unchanged. */ name->hash = full_name_hash(dir, name->name, name->len); if (dir->d_flags & DCACHE_OP_HASH) { int err = dir->d_op->d_hash(dir, name); if (unlikely(err < 0)) return ERR_PTR(err); } return d_lookup(dir, name); } EXPORT_SYMBOL(d_hash_and_lookup); /* * When a file is deleted, we have two options: * - turn this dentry into a negative dentry * - unhash this dentry and free it. * * Usually, we want to just turn this into * a negative dentry, but if anybody else is * currently using the dentry or the inode * we can't do that and we fall back on removing * it from the hash queues and waiting for * it to be deleted later when it has no users */ /** * d_delete - delete a dentry * @dentry: The dentry to delete * * Turn the dentry into a negative dentry if possible, otherwise * remove it from the hash queues so it can be deleted later */ void d_delete(struct dentry * dentry) { struct inode *inode = dentry->d_inode; spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); /* * Are we the only user? */ if (dentry->d_lockref.count == 1) { dentry->d_flags &= ~DCACHE_CANT_MOUNT; dentry_unlink_inode(dentry); } else { __d_drop(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); } } EXPORT_SYMBOL(d_delete); static void __d_rehash(struct dentry *entry) { struct hlist_bl_head *b = d_hash(entry->d_name.hash); hlist_bl_lock(b); hlist_bl_add_head_rcu(&entry->d_hash, b); hlist_bl_unlock(b); } /** * d_rehash - add an entry back to the hash * @entry: dentry to add to the hash * * Adds a dentry to the hash according to its name. */ void d_rehash(struct dentry * entry) { spin_lock(&entry->d_lock); __d_rehash(entry); spin_unlock(&entry->d_lock); } EXPORT_SYMBOL(d_rehash); static inline unsigned start_dir_add(struct inode *dir) { for (;;) { unsigned n = dir->i_dir_seq; if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) return n; cpu_relax(); } } static inline void end_dir_add(struct inode *dir, unsigned n) { smp_store_release(&dir->i_dir_seq, n + 2); } static void d_wait_lookup(struct dentry *dentry) { if (d_in_lookup(dentry)) { DECLARE_WAITQUEUE(wait, current); add_wait_queue(dentry->d_wait, &wait); do { set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock(&dentry->d_lock); schedule(); spin_lock(&dentry->d_lock); } while (d_in_lookup(dentry)); } } struct dentry *d_alloc_parallel(struct dentry *parent, const struct qstr *name, wait_queue_head_t *wq) { unsigned int hash = name->hash; struct hlist_bl_head *b = in_lookup_hash(parent, hash); struct hlist_bl_node *node; struct dentry *new = d_alloc(parent, name); struct dentry *dentry; unsigned seq, r_seq, d_seq; if (unlikely(!new)) return ERR_PTR(-ENOMEM); retry: rcu_read_lock(); seq = smp_load_acquire(&parent->d_inode->i_dir_seq); r_seq = read_seqbegin(&rename_lock); dentry = __d_lookup_rcu(parent, name, &d_seq); if (unlikely(dentry)) { if (!lockref_get_not_dead(&dentry->d_lockref)) { rcu_read_unlock(); goto retry; } if (read_seqcount_retry(&dentry->d_seq, d_seq)) { rcu_read_unlock(); dput(dentry); goto retry; } rcu_read_unlock(); dput(new); return dentry; } if (unlikely(read_seqretry(&rename_lock, r_seq))) { rcu_read_unlock(); goto retry; } if (unlikely(seq & 1)) { rcu_read_unlock(); goto retry; } hlist_bl_lock(b); if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) { hlist_bl_unlock(b); rcu_read_unlock(); goto retry; } /* * No changes for the parent since the beginning of d_lookup(). * Since all removals from the chain happen with hlist_bl_lock(), * any potential in-lookup matches are going to stay here until * we unlock the chain. All fields are stable in everything * we encounter. */ hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) { if (dentry->d_name.hash != hash) continue; if (dentry->d_parent != parent) continue; if (!d_same_name(dentry, parent, name)) continue; hlist_bl_unlock(b); /* now we can try to grab a reference */ if (!lockref_get_not_dead(&dentry->d_lockref)) { rcu_read_unlock(); goto retry; } rcu_read_unlock(); /* * somebody is likely to be still doing lookup for it; * wait for them to finish */ spin_lock(&dentry->d_lock); d_wait_lookup(dentry); /* * it's not in-lookup anymore; in principle we should repeat * everything from dcache lookup, but it's likely to be what * d_lookup() would've found anyway. If it is, just return it; * otherwise we really have to repeat the whole thing. */ if (unlikely(dentry->d_name.hash != hash)) goto mismatch; if (unlikely(dentry->d_parent != parent)) goto mismatch; if (unlikely(d_unhashed(dentry))) goto mismatch; if (unlikely(!d_same_name(dentry, parent, name))) goto mismatch; /* OK, it *is* a hashed match; return it */ spin_unlock(&dentry->d_lock); dput(new); return dentry; } rcu_read_unlock(); /* we can't take ->d_lock here; it's OK, though. */ new->d_flags |= DCACHE_PAR_LOOKUP; new->d_wait = wq; hlist_bl_add_head_rcu(&new->d_u.d_in_lookup_hash, b); hlist_bl_unlock(b); return new; mismatch: spin_unlock(&dentry->d_lock); dput(dentry); goto retry; } EXPORT_SYMBOL(d_alloc_parallel); void __d_lookup_done(struct dentry *dentry) { struct hlist_bl_head *b = in_lookup_hash(dentry->d_parent, dentry->d_name.hash); hlist_bl_lock(b); dentry->d_flags &= ~DCACHE_PAR_LOOKUP; __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); wake_up_all(dentry->d_wait); dentry->d_wait = NULL; hlist_bl_unlock(b); INIT_HLIST_NODE(&dentry->d_u.d_alias); INIT_LIST_HEAD(&dentry->d_lru); } EXPORT_SYMBOL(__d_lookup_done); /* inode->i_lock held if inode is non-NULL */ static inline void __d_add(struct dentry *dentry, struct inode *inode) { struct inode *dir = NULL; unsigned n; spin_lock(&dentry->d_lock); if (unlikely(d_in_lookup(dentry))) { dir = dentry->d_parent->d_inode; n = start_dir_add(dir); __d_lookup_done(dentry); } if (inode) { unsigned add_flags = d_flags_for_inode(inode); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); fsnotify_update_flags(dentry); } __d_rehash(dentry); if (dir) end_dir_add(dir, n); spin_unlock(&dentry->d_lock); if (inode) spin_unlock(&inode->i_lock); } /** * d_add - add dentry to hash queues * @entry: dentry to add * @inode: The inode to attach to this dentry * * This adds the entry to the hash queues and initializes @inode. * The entry was actually filled in earlier during d_alloc(). */ void d_add(struct dentry *entry, struct inode *inode) { if (inode) { security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); } __d_add(entry, inode); } EXPORT_SYMBOL(d_add); /** * d_exact_alias - find and hash an exact unhashed alias * @entry: dentry to add * @inode: The inode to go with this dentry * * If an unhashed dentry with the same name/parent and desired * inode already exists, hash and return it. Otherwise, return * NULL. * * Parent directory should be locked. */ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode) { struct dentry *alias; unsigned int hash = entry->d_name.hash; spin_lock(&inode->i_lock); hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { /* * Don't need alias->d_lock here, because aliases with * d_parent == entry->d_parent are not subject to name or * parent changes, because the parent inode i_mutex is held. */ if (alias->d_name.hash != hash) continue; if (alias->d_parent != entry->d_parent) continue; if (!d_same_name(alias, entry->d_parent, &entry->d_name)) continue; spin_lock(&alias->d_lock); if (!d_unhashed(alias)) { spin_unlock(&alias->d_lock); alias = NULL; } else { __dget_dlock(alias); __d_rehash(alias); spin_unlock(&alias->d_lock); } spin_unlock(&inode->i_lock); return alias; } spin_unlock(&inode->i_lock); return NULL; } EXPORT_SYMBOL(d_exact_alias); static void swap_names(struct dentry *dentry, struct dentry *target) { if (unlikely(dname_external(target))) { if (unlikely(dname_external(dentry))) { /* * Both external: swap the pointers */ swap(target->d_name.name, dentry->d_name.name); } else { /* * dentry:internal, target:external. Steal target's * storage and make target internal. */ memcpy(target->d_iname, dentry->d_name.name, dentry->d_name.len + 1); dentry->d_name.name = target->d_name.name; target->d_name.name = target->d_iname; } } else { if (unlikely(dname_external(dentry))) { /* * dentry:external, target:internal. Give dentry's * storage to target and make dentry internal */ memcpy(dentry->d_iname, target->d_name.name, target->d_name.len + 1); target->d_name.name = dentry->d_name.name; dentry->d_name.name = dentry->d_iname; } else { /* * Both are internal. */ unsigned int i; BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { swap(((long *) &dentry->d_iname)[i], ((long *) &target->d_iname)[i]); } } } swap(dentry->d_name.hash_len, target->d_name.hash_len); } static void copy_name(struct dentry *dentry, struct dentry *target) { struct external_name *old_name = NULL; if (unlikely(dname_external(dentry))) old_name = external_name(dentry); if (unlikely(dname_external(target))) { atomic_inc(&external_name(target)->u.count); dentry->d_name = target->d_name; } else { memcpy(dentry->d_iname, target->d_name.name, target->d_name.len + 1); dentry->d_name.name = dentry->d_iname; dentry->d_name.hash_len = target->d_name.hash_len; } if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) kfree_rcu(old_name, u.head); } /* * __d_move - move a dentry * @dentry: entry to move * @target: new dentry * @exchange: exchange the two dentries * * Update the dcache to reflect the move of a file name. Negative * dcache entries should not be moved in this way. Caller must hold * rename_lock, the i_mutex of the source and target directories, * and the sb->s_vfs_rename_mutex if they differ. See lock_rename(). */ static void __d_move(struct dentry *dentry, struct dentry *target, bool exchange) { struct dentry *old_parent, *p; struct inode *dir = NULL; unsigned n; WARN_ON(!dentry->d_inode); if (WARN_ON(dentry == target)) return; BUG_ON(d_ancestor(target, dentry)); old_parent = dentry->d_parent; p = d_ancestor(old_parent, target); if (IS_ROOT(dentry)) { BUG_ON(p); spin_lock(&target->d_parent->d_lock); } else if (!p) { /* target is not a descendent of dentry->d_parent */ spin_lock(&target->d_parent->d_lock); spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED); } else { BUG_ON(p == dentry); spin_lock(&old_parent->d_lock); if (p != target) spin_lock_nested(&target->d_parent->d_lock, DENTRY_D_LOCK_NESTED); } spin_lock_nested(&dentry->d_lock, 2); spin_lock_nested(&target->d_lock, 3); if (unlikely(d_in_lookup(target))) { dir = target->d_parent->d_inode; n = start_dir_add(dir); __d_lookup_done(target); } write_seqcount_begin(&dentry->d_seq); write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED); /* unhash both */ if (!d_unhashed(dentry)) ___d_drop(dentry); if (!d_unhashed(target)) ___d_drop(target); /* ... and switch them in the tree */ dentry->d_parent = target->d_parent; if (!exchange) { copy_name(dentry, target); target->d_hash.pprev = NULL; dentry->d_parent->d_lockref.count++; if (dentry != old_parent) /* wasn't IS_ROOT */ WARN_ON(!--old_parent->d_lockref.count); } else { target->d_parent = old_parent; swap_names(dentry, target); list_move(&target->d_child, &target->d_parent->d_subdirs); __d_rehash(target); fsnotify_update_flags(target); } list_move(&dentry->d_child, &dentry->d_parent->d_subdirs); __d_rehash(dentry); fsnotify_update_flags(dentry); fscrypt_handle_d_move(dentry); write_seqcount_end(&target->d_seq); write_seqcount_end(&dentry->d_seq); if (dir) end_dir_add(dir, n); if (dentry->d_parent != old_parent) spin_unlock(&dentry->d_parent->d_lock); if (dentry != old_parent) spin_unlock(&old_parent->d_lock); spin_unlock(&target->d_lock); spin_unlock(&dentry->d_lock); } /* * d_move - move a dentry * @dentry: entry to move * @target: new dentry * * Update the dcache to reflect the move of a file name. Negative * dcache entries should not be moved in this way. See the locking * requirements for __d_move. */ void d_move(struct dentry *dentry, struct dentry *target) { write_seqlock(&rename_lock); __d_move(dentry, target, false); write_sequnlock(&rename_lock); } EXPORT_SYMBOL(d_move); /* * d_exchange - exchange two dentries * @dentry1: first dentry * @dentry2: second dentry */ void d_exchange(struct dentry *dentry1, struct dentry *dentry2) { write_seqlock(&rename_lock); WARN_ON(!dentry1->d_inode); WARN_ON(!dentry2->d_inode); WARN_ON(IS_ROOT(dentry1)); WARN_ON(IS_ROOT(dentry2)); __d_move(dentry1, dentry2, true); write_sequnlock(&rename_lock); } /** * d_ancestor - search for an ancestor * @p1: ancestor dentry * @p2: child dentry * * Returns the ancestor dentry of p2 which is a child of p1, if p1 is * an ancestor of p2, else NULL. */ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) { struct dentry *p; for (p = p2; !IS_ROOT(p); p = p->d_parent) { if (p->d_parent == p1) return p; } return NULL; } /* * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding * dentry->d_parent->d_inode->i_mutex, and rename_lock * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... */ static int __d_unalias(struct inode *inode, struct dentry *dentry, struct dentry *alias) { struct mutex *m1 = NULL; struct rw_semaphore *m2 = NULL; int ret = -ESTALE; /* If alias and dentry share a parent, then no extra locks required */ if (alias->d_parent == dentry->d_parent) goto out_unalias; /* See lock_rename() */ if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) goto out_err; m1 = &dentry->d_sb->s_vfs_rename_mutex; if (!inode_trylock_shared(alias->d_parent->d_inode)) goto out_err; m2 = &alias->d_parent->d_inode->i_rwsem; out_unalias: __d_move(alias, dentry, false); ret = 0; out_err: if (m2) up_read(m2); if (m1) mutex_unlock(m1); return ret; } /** * d_splice_alias - splice a disconnected dentry into the tree if one exists * @inode: the inode which may have a disconnected dentry * @dentry: a negative dentry which we want to point to the inode. * * If inode is a directory and has an IS_ROOT alias, then d_move that in * place of the given dentry and return it, else simply d_add the inode * to the dentry and return NULL. * * If a non-IS_ROOT directory is found, the filesystem is corrupt, and * we should error out: directories can't have multiple aliases. * * This is needed in the lookup routine of any filesystem that is exportable * (via knfsd) so that we can build dcache paths to directories effectively. * * If a dentry was found and moved, then it is returned. Otherwise NULL * is returned. This matches the expected return value of ->lookup. * * Cluster filesystems may call this function with a negative, hashed dentry. * In that case, we know that the inode will be a regular file, and also this * will only occur during atomic_open. So we need to check for the dentry * being already hashed only in the final case. */ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) { if (IS_ERR(inode)) return ERR_CAST(inode); BUG_ON(!d_unhashed(dentry)); if (!inode) goto out; security_d_instantiate(dentry, inode); spin_lock(&inode->i_lock); if (S_ISDIR(inode->i_mode)) { struct dentry *new = __d_find_any_alias(inode); if (unlikely(new)) { /* The reference to new ensures it remains an alias */ spin_unlock(&inode->i_lock); write_seqlock(&rename_lock); if (unlikely(d_ancestor(new, dentry))) { write_sequnlock(&rename_lock); dput(new); new = ERR_PTR(-ELOOP); pr_warn_ratelimited( "VFS: Lookup of '%s' in %s %s" " would have caused loop\n", dentry->d_name.name, inode->i_sb->s_type->name, inode->i_sb->s_id); } else if (!IS_ROOT(new)) { struct dentry *old_parent = dget(new->d_parent); int err = __d_unalias(inode, dentry, new); write_sequnlock(&rename_lock); if (err) { dput(new); new = ERR_PTR(err); } dput(old_parent); } else { __d_move(new, dentry, false); write_sequnlock(&rename_lock); } iput(inode); return new; } } out: __d_add(dentry, inode); return NULL; } EXPORT_SYMBOL(d_splice_alias); /* * Test whether new_dentry is a subdirectory of old_dentry. * * Trivially implemented using the dcache structure */ /** * is_subdir - is new dentry a subdirectory of old_dentry * @new_dentry: new dentry * @old_dentry: old dentry * * Returns true if new_dentry is a subdirectory of the parent (at any depth). * Returns false otherwise. * Caller must ensure that "new_dentry" is pinned before calling is_subdir() */ bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) { bool result; unsigned seq; if (new_dentry == old_dentry) return true; do { /* for restarting inner loop in case of seq retry */ seq = read_seqbegin(&rename_lock); /* * Need rcu_readlock to protect against the d_parent trashing * due to d_move */ rcu_read_lock(); if (d_ancestor(old_dentry, new_dentry)) result = true; else result = false; rcu_read_unlock(); } while (read_seqretry(&rename_lock, seq)); return result; } EXPORT_SYMBOL(is_subdir); static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) { struct dentry *root = data; if (dentry != root) { if (d_unhashed(dentry) || !dentry->d_inode) return D_WALK_SKIP; if (!(dentry->d_flags & DCACHE_GENOCIDE)) { dentry->d_flags |= DCACHE_GENOCIDE; dentry->d_lockref.count--; } } return D_WALK_CONTINUE; } void d_genocide(struct dentry *parent) { d_walk(parent, parent, d_genocide_kill); } EXPORT_SYMBOL(d_genocide); void d_tmpfile(struct dentry *dentry, struct inode *inode) { inode_dec_link_count(inode); BUG_ON(dentry->d_name.name != dentry->d_iname || !hlist_unhashed(&dentry->d_u.d_alias) || !d_unlinked(dentry)); spin_lock(&dentry->d_parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); dentry->d_name.len = sprintf(dentry->d_iname, "#%llu", (unsigned long long)inode->i_ino); spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_parent->d_lock); d_instantiate(dentry, inode); } EXPORT_SYMBOL(d_tmpfile); static __initdata unsigned long dhash_entries; static int __init set_dhash_entries(char *str) { if (!str) return 0; dhash_entries = simple_strtoul(str, &str, 0); return 1; } __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ if (hashdist) return; dentry_hashtable = alloc_large_system_hash("Dentry cache", sizeof(struct hlist_bl_head), dhash_entries, 13, HASH_EARLY | HASH_ZERO, &d_hash_shift, NULL, 0, 0); d_hash_shift = 32 - d_hash_shift; } static void __init dcache_init(void) { /* * A constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature * of the dcache. */ dentry_cache = KMEM_CACHE_USERCOPY(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT, d_iname); /* Hash may have been set up in dcache_init_early */ if (!hashdist) return; dentry_hashtable = alloc_large_system_hash("Dentry cache", sizeof(struct hlist_bl_head), dhash_entries, 13, HASH_ZERO, &d_hash_shift, NULL, 0, 0); d_hash_shift = 32 - d_hash_shift; } /* SLAB cache for __getname() consumers */ struct kmem_cache *names_cachep __read_mostly; EXPORT_SYMBOL(names_cachep); void __init vfs_caches_init_early(void) { int i; for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++) INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]); dcache_init_early(); inode_init_early(); } void __init vfs_caches_init(void) { names_cachep = kmem_cache_create_usercopy("names_cache", PATH_MAX, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, 0, PATH_MAX, NULL); dcache_init(); inode_init(); files_init(); files_maxfiles_init(); mnt_init(); bdev_cache_init(); chrdev_init(); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_TASK_H #define _LINUX_SCHED_TASK_H /* * Interface between the scheduler and various task lifetime (fork()/exit()) * functionality: */ #include <linux/sched.h> #include <linux/uaccess.h> struct task_struct; struct rusage; union thread_union; struct css_set; /* All the bits taken by the old clone syscall. */ #define CLONE_LEGACY_FLAGS 0xffffffffULL struct kernel_clone_args { u64 flags; int __user *pidfd; int __user *child_tid; int __user *parent_tid; int exit_signal; unsigned long stack; unsigned long stack_size; unsigned long tls; pid_t *set_tid; /* Number of elements in *set_tid */ size_t set_tid_size; int cgroup; struct cgroup *cgrp; struct css_set *cset; }; /* * This serializes "schedule()" and also protects * the run-queue from deletions/modifications (but * _adding_ to the beginning of the run-queue has * a separate lock). */ extern rwlock_t tasklist_lock; extern spinlock_t mmlist_lock; extern union thread_union init_thread_union; extern struct task_struct init_task; #ifdef CONFIG_PROVE_RCU extern int lockdep_tasklist_lock_is_held(void); #endif /* #ifdef CONFIG_PROVE_RCU */ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); extern void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); extern void proc_caches_init(void); extern void fork_init(void); extern void release_task(struct task_struct * p); extern int copy_thread(unsigned long, unsigned long, unsigned long, struct task_struct *, unsigned long); extern void flush_thread(void); #ifdef CONFIG_HAVE_EXIT_THREAD extern void exit_thread(struct task_struct *tsk); #else static inline void exit_thread(struct task_struct *tsk) { } #endif extern void do_group_exit(int); extern void exit_files(struct task_struct *); extern void exit_itimers(struct signal_struct *); extern pid_t kernel_clone(struct kernel_clone_args *kargs); struct task_struct *fork_idle(int); struct mm_struct *copy_init_mm(void); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); int kernel_wait(pid_t pid, int *stat); extern void free_task(struct task_struct *tsk); /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP extern void sched_exec(void); #else #define sched_exec() {} #endif static inline struct task_struct *get_task_struct(struct task_struct *t) { refcount_inc(&t->usage); return t; } extern void __put_task_struct(struct task_struct *t); static inline void put_task_struct(struct task_struct *t) { if (refcount_dec_and_test(&t->usage)) __put_task_struct(t); } static inline void put_task_struct_many(struct task_struct *t, int nr) { if (refcount_sub_and_test(nr, &t->usage)) __put_task_struct(t); } void put_task_struct_rcu_user(struct task_struct *task); #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT extern int arch_task_struct_size __read_mostly; #else # define arch_task_struct_size (sizeof(struct task_struct)) #endif #ifndef CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST /* * If an architecture has not declared a thread_struct whitelist we * must assume something there may need to be copied to userspace. */ static inline void arch_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { *offset = 0; /* Handle dynamically sized thread_struct. */ *size = arch_task_struct_size - offsetof(struct task_struct, thread); } #endif #ifdef CONFIG_VMAP_STACK static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) { return t->stack_vm_area; } #else static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) { return NULL; } #endif /* * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also * pins the final release of task.io_context. Also protects ->cpuset and * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. * * Nests both inside and outside of read_lock(&tasklist_lock). * It must not be nested with write_lock_irq(&tasklist_lock), * neither inside nor outside. */ static inline void task_lock(struct task_struct *p) { spin_lock(&p->alloc_lock); } static inline void task_unlock(struct task_struct *p) { spin_unlock(&p->alloc_lock); } #endif /* _LINUX_SCHED_TASK_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 #undef TRACE_SYSTEM #define TRACE_SYSTEM qdisc #if !defined(_TRACE_QDISC_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_QDISC_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/tracepoint.h> #include <linux/ftrace.h> #include <linux/pkt_sched.h> #include <net/sch_generic.h> TRACE_EVENT(qdisc_dequeue, TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, int packets, struct sk_buff *skb), TP_ARGS(qdisc, txq, packets, skb), TP_STRUCT__entry( __field( struct Qdisc *, qdisc ) __field(const struct netdev_queue *, txq ) __field( int, packets ) __field( void *, skbaddr ) __field( int, ifindex ) __field( u32, handle ) __field( u32, parent ) __field( unsigned long, txq_state) ), /* skb==NULL indicate packets dequeued was 0, even when packets==1 */ TP_fast_assign( __entry->qdisc = qdisc; __entry->txq = txq; __entry->packets = skb ? packets : 0; __entry->skbaddr = skb; __entry->ifindex = txq->dev ? txq->dev->ifindex : 0; __entry->handle = qdisc->handle; __entry->parent = qdisc->parent; __entry->txq_state = txq->state; ), TP_printk("dequeue ifindex=%d qdisc handle=0x%X parent=0x%X txq_state=0x%lX packets=%d skbaddr=%p", __entry->ifindex, __entry->handle, __entry->parent, __entry->txq_state, __entry->packets, __entry->skbaddr ) ); TRACE_EVENT(qdisc_reset, TP_PROTO(struct Qdisc *q), TP_ARGS(q), TP_STRUCT__entry( __string( dev, qdisc_dev(q) ) __string( kind, q->ops->id ) __field( u32, parent ) __field( u32, handle ) ), TP_fast_assign( __assign_str(dev, qdisc_dev(q)); __assign_str(kind, q->ops->id); __entry->parent = q->parent; __entry->handle = q->handle; ), TP_printk("dev=%s kind=%s parent=%x:%x handle=%x:%x", __get_str(dev), __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent), TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle)) ); TRACE_EVENT(qdisc_destroy, TP_PROTO(struct Qdisc *q), TP_ARGS(q), TP_STRUCT__entry( __string( dev, qdisc_dev(q) ) __string( kind, q->ops->id ) __field( u32, parent ) __field( u32, handle ) ), TP_fast_assign( __assign_str(dev, qdisc_dev(q)); __assign_str(kind, q->ops->id); __entry->parent = q->parent; __entry->handle = q->handle; ), TP_printk("dev=%s kind=%s parent=%x:%x handle=%x:%x", __get_str(dev), __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent), TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle)) ); TRACE_EVENT(qdisc_create, TP_PROTO(const struct Qdisc_ops *ops, struct net_device *dev, u32 parent), TP_ARGS(ops, dev, parent), TP_STRUCT__entry( __string( dev, dev->name ) __string( kind, ops->id ) __field( u32, parent ) ), TP_fast_assign( __assign_str(dev, dev->name); __assign_str(kind, ops->id); __entry->parent = parent; ), TP_printk("dev=%s kind=%s parent=%x:%x", __get_str(dev), __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent)) ); #endif /* _TRACE_QDISC_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BLK_MQ_H #define BLK_MQ_H #include <linux/blkdev.h> #include <linux/sbitmap.h> #include <linux/srcu.h> struct blk_mq_tags; struct blk_flush_queue; /** * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware * block device */ struct blk_mq_hw_ctx { struct { /** @lock: Protects the dispatch list. */ spinlock_t lock; /** * @dispatch: Used for requests that are ready to be * dispatched to the hardware but for some reason (e.g. lack of * resources) could not be sent to the hardware. As soon as the * driver can send new requests, requests at this list will * be sent first for a fairer dispatch. */ struct list_head dispatch; /** * @state: BLK_MQ_S_* flags. Defines the state of the hw * queue (active, scheduled to restart, stopped). */ unsigned long state; } ____cacheline_aligned_in_smp; /** * @run_work: Used for scheduling a hardware queue run at a later time. */ struct delayed_work run_work; /** @cpumask: Map of available CPUs where this hctx can run. */ cpumask_var_t cpumask; /** * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU * selection from @cpumask. */ int next_cpu; /** * @next_cpu_batch: Counter of how many works left in the batch before * changing to the next CPU. */ int next_cpu_batch; /** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */ unsigned long flags; /** * @sched_data: Pointer owned by the IO scheduler attached to a request * queue. It's up to the IO scheduler how to use this pointer. */ void *sched_data; /** * @queue: Pointer to the request queue that owns this hardware context. */ struct request_queue *queue; /** @fq: Queue of requests that need to perform a flush operation. */ struct blk_flush_queue *fq; /** * @driver_data: Pointer to data owned by the block driver that created * this hctx */ void *driver_data; /** * @ctx_map: Bitmap for each software queue. If bit is on, there is a * pending request in that software queue. */ struct sbitmap ctx_map; /** * @dispatch_from: Software queue to be used when no scheduler was * selected. */ struct blk_mq_ctx *dispatch_from; /** * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to * decide if the hw_queue is busy using Exponential Weighted Moving * Average algorithm. */ unsigned int dispatch_busy; /** @type: HCTX_TYPE_* flags. Type of hardware queue. */ unsigned short type; /** @nr_ctx: Number of software queues. */ unsigned short nr_ctx; /** @ctxs: Array of software queues. */ struct blk_mq_ctx **ctxs; /** @dispatch_wait_lock: Lock for dispatch_wait queue. */ spinlock_t dispatch_wait_lock; /** * @dispatch_wait: Waitqueue to put requests when there is no tag * available at the moment, to wait for another try in the future. */ wait_queue_entry_t dispatch_wait; /** * @wait_index: Index of next available dispatch_wait queue to insert * requests. */ atomic_t wait_index; /** * @tags: Tags owned by the block driver. A tag at this set is only * assigned when a request is dispatched from a hardware queue. */ struct blk_mq_tags *tags; /** * @sched_tags: Tags owned by I/O scheduler. If there is an I/O * scheduler associated with a request queue, a tag is assigned when * that request is allocated. Else, this member is not used. */ struct blk_mq_tags *sched_tags; /** @queued: Number of queued requests. */ unsigned long queued; /** @run: Number of dispatched requests. */ unsigned long run; #define BLK_MQ_MAX_DISPATCH_ORDER 7 /** @dispatched: Number of dispatch requests by queue. */ unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; /** @numa_node: NUMA node the storage adapter has been connected to. */ unsigned int numa_node; /** @queue_num: Index of this hardware queue. */ unsigned int queue_num; /** * @nr_active: Number of active requests. Only used when a tag set is * shared across request queues. */ atomic_t nr_active; /** * @elevator_queued: Number of queued requests on hctx. */ atomic_t elevator_queued; /** @cpuhp_online: List to store request if CPU is going to die */ struct hlist_node cpuhp_online; /** @cpuhp_dead: List to store request if some CPU die. */ struct hlist_node cpuhp_dead; /** @kobj: Kernel object for sysfs. */ struct kobject kobj; /** @poll_considered: Count times blk_poll() was called. */ unsigned long poll_considered; /** @poll_invoked: Count how many requests blk_poll() polled. */ unsigned long poll_invoked; /** @poll_success: Count how many polled requests were completed. */ unsigned long poll_success; #ifdef CONFIG_BLK_DEBUG_FS /** * @debugfs_dir: debugfs directory for this hardware queue. Named * as cpu<cpu_number>. */ struct dentry *debugfs_dir; /** @sched_debugfs_dir: debugfs directory for the scheduler. */ struct dentry *sched_debugfs_dir; #endif /** * @hctx_list: if this hctx is not in use, this is an entry in * q->unused_hctx_list. */ struct list_head hctx_list; /** * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also * blk_mq_hw_ctx_size(). */ struct srcu_struct srcu[]; }; /** * struct blk_mq_queue_map - Map software queues to hardware queues * @mq_map: CPU ID to hardware queue index map. This is an array * with nr_cpu_ids elements. Each element has a value in the range * [@queue_offset, @queue_offset + @nr_queues). * @nr_queues: Number of hardware queues to map CPU IDs onto. * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe * driver to map each hardware queue type (enum hctx_type) onto a distinct * set of hardware queues. */ struct blk_mq_queue_map { unsigned int *mq_map; unsigned int nr_queues; unsigned int queue_offset; }; /** * enum hctx_type - Type of hardware queue * @HCTX_TYPE_DEFAULT: All I/O not otherwise accounted for. * @HCTX_TYPE_READ: Just for READ I/O. * @HCTX_TYPE_POLL: Polled I/O of any kind. * @HCTX_MAX_TYPES: Number of types of hctx. */ enum hctx_type { HCTX_TYPE_DEFAULT, HCTX_TYPE_READ, HCTX_TYPE_POLL, HCTX_MAX_TYPES, }; /** * struct blk_mq_tag_set - tag set that can be shared between request queues * @map: One or more ctx -> hctx mappings. One map exists for each * hardware queue type (enum hctx_type) that the driver wishes * to support. There are no restrictions on maps being of the * same size, and it's perfectly legal to share maps between * types. * @nr_maps: Number of elements in the @map array. A number in the range * [1, HCTX_MAX_TYPES]. * @ops: Pointers to functions that implement block driver behavior. * @nr_hw_queues: Number of hardware queues supported by the block driver that * owns this data structure. * @queue_depth: Number of tags per hardware queue, reserved tags included. * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag * allocations. * @cmd_size: Number of additional bytes to allocate per request. The block * driver owns these additional bytes. * @numa_node: NUMA node the storage adapter has been connected to. * @timeout: Request processing timeout in jiffies. * @flags: Zero or more BLK_MQ_F_* flags. * @driver_data: Pointer to data owned by the block driver that created this * tag set. * @active_queues_shared_sbitmap: * number of active request queues per tag set. * @__bitmap_tags: A shared tags sbitmap, used over all hctx's * @__breserved_tags: * A shared reserved tags sbitmap, used over all hctx's * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues * elements. * @tag_list_lock: Serializes tag_list accesses. * @tag_list: List of the request queues that use this tag set. See also * request_queue.tag_set_list. */ struct blk_mq_tag_set { struct blk_mq_queue_map map[HCTX_MAX_TYPES]; unsigned int nr_maps; const struct blk_mq_ops *ops; unsigned int nr_hw_queues; unsigned int queue_depth; unsigned int reserved_tags; unsigned int cmd_size; int numa_node; unsigned int timeout; unsigned int flags; void *driver_data; atomic_t active_queues_shared_sbitmap; struct sbitmap_queue __bitmap_tags; struct sbitmap_queue __breserved_tags; struct blk_mq_tags **tags; struct mutex tag_list_lock; struct list_head tag_list; }; /** * struct blk_mq_queue_data - Data about a request inserted in a queue * * @rq: Request pointer. * @last: If it is the last request in the queue. */ struct blk_mq_queue_data { struct request *rq; bool last; }; typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, bool); typedef bool (busy_tag_iter_fn)(struct request *, void *, bool); /** * struct blk_mq_ops - Callback functions that implements block driver * behaviour. */ struct blk_mq_ops { /** * @queue_rq: Queue a new request from block IO. */ blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); /** * @commit_rqs: If a driver uses bd->last to judge when to submit * requests to hardware, it must define this function. In case of errors * that make us stop issuing further requests, this hook serves the * purpose of kicking the hardware (which the last request otherwise * would have done). */ void (*commit_rqs)(struct blk_mq_hw_ctx *); /** * @get_budget: Reserve budget before queue request, once .queue_rq is * run, it is driver's responsibility to release the * reserved budget. Also we have to handle failure case * of .get_budget for avoiding I/O deadlock. */ bool (*get_budget)(struct request_queue *); /** * @put_budget: Release the reserved budget. */ void (*put_budget)(struct request_queue *); /** * @timeout: Called on request timeout. */ enum blk_eh_timer_return (*timeout)(struct request *, bool); /** * @poll: Called to poll for completion of a specific tag. */ int (*poll)(struct blk_mq_hw_ctx *); /** * @complete: Mark the request as complete. */ void (*complete)(struct request *); /** * @init_hctx: Called when the block layer side of a hardware queue has * been set up, allowing the driver to allocate/init matching * structures. */ int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int); /** * @exit_hctx: Ditto for exit/teardown. */ void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); /** * @init_request: Called for every command allocated by the block layer * to allow the driver to set up driver specific data. * * Tag greater than or equal to queue_depth is for setting up * flush request. */ int (*init_request)(struct blk_mq_tag_set *set, struct request *, unsigned int, unsigned int); /** * @exit_request: Ditto for exit/teardown. */ void (*exit_request)(struct blk_mq_tag_set *set, struct request *, unsigned int); /** * @initialize_rq_fn: Called from inside blk_get_request(). */ void (*initialize_rq_fn)(struct request *rq); /** * @cleanup_rq: Called before freeing one request which isn't completed * yet, and usually for freeing the driver private data. */ void (*cleanup_rq)(struct request *); /** * @busy: If set, returns whether or not this queue currently is busy. */ bool (*busy)(struct request_queue *); /** * @map_queues: This allows drivers specify their own queue mapping by * overriding the setup-time function that builds the mq_map. */ int (*map_queues)(struct blk_mq_tag_set *set); #ifdef CONFIG_BLK_DEBUG_FS /** * @show_rq: Used by the debugfs implementation to show driver-specific * information about a request. */ void (*show_rq)(struct seq_file *m, struct request *rq); #endif }; enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1, /* * Set when this device requires underlying blk-mq device for * completing IO: */ BLK_MQ_F_STACKING = 1 << 2, BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3, BLK_MQ_F_BLOCKING = 1 << 5, BLK_MQ_F_NO_SCHED = 1 << 6, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, BLK_MQ_F_ALLOC_POLICY_BITS = 1, BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_S_SCHED_RESTART = 2, /* hw queue is inactive after all its CPUs become offline */ BLK_MQ_S_INACTIVE = 3, BLK_MQ_MAX_DEPTH = 10240, BLK_MQ_CPU_WORK_BATCH = 8, }; #define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \ ((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \ ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) #define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \ ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \ << BLK_MQ_F_ALLOC_POLICY_START_BIT) struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, void *queuedata); struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q, bool elevator_init); struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int queue_depth, unsigned int set_flags); void blk_mq_unregister_dev(struct device *, struct request_queue *); int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set); void blk_mq_free_tag_set(struct blk_mq_tag_set *set); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_free_request(struct request *rq); bool blk_mq_queue_inflight(struct request_queue *q); enum { /* return when out of requests */ BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0), /* allocate from reserved pool */ BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1), /* set RQF_PM */ BLK_MQ_REQ_PM = (__force blk_mq_req_flags_t)(1 << 2), }; struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, blk_mq_req_flags_t flags); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); enum { BLK_MQ_UNIQUE_TAG_BITS = 16, BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1, }; u32 blk_mq_unique_tag(struct request *rq); static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag) { return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS; } static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag) { return unique_tag & BLK_MQ_UNIQUE_TAG_MASK; } /** * blk_mq_rq_state() - read the current MQ_RQ_* state of a request * @rq: target request. */ static inline enum mq_rq_state blk_mq_rq_state(struct request *rq) { return READ_ONCE(rq->state); } static inline int blk_mq_request_started(struct request *rq) { return blk_mq_rq_state(rq) != MQ_RQ_IDLE; } static inline int blk_mq_request_completed(struct request *rq) { return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE; } void blk_mq_start_request(struct request *rq); void blk_mq_end_request(struct request *rq, blk_status_t error); void __blk_mq_end_request(struct request *rq, blk_status_t error); void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); void blk_mq_complete_request(struct request *rq); bool blk_mq_complete_request_remote(struct request *rq); bool blk_mq_queue_stopped(struct request_queue *q); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_stop_hw_queues(struct request_queue *q); void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); int blk_mq_map_queues(struct blk_mq_queue_map *qmap); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); void blk_mq_quiesce_queue_nowait(struct request_queue *q); unsigned int blk_mq_rq_cpu(struct request *rq); bool __blk_should_fake_timeout(struct request_queue *q); static inline bool blk_should_fake_timeout(struct request_queue *q) { if (IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) && test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags)) return __blk_should_fake_timeout(q); return false; } /** * blk_mq_rq_from_pdu - cast a PDU to a request * @pdu: the PDU (Protocol Data Unit) to be casted * * Return: request * * Driver command data is immediately after the request. So subtract request * size to get back to the original request. */ static inline struct request *blk_mq_rq_from_pdu(void *pdu) { return pdu - sizeof(struct request); } /** * blk_mq_rq_to_pdu - cast a request to a PDU * @rq: the request to be casted * * Return: pointer to the PDU * * Driver command data is immediately after the request. So add request to get * the PDU. */ static inline void *blk_mq_rq_to_pdu(struct request *rq) { return rq + 1; } #define queue_for_each_hw_ctx(q, hctx, i) \ for ((i) = 0; (i) < (q)->nr_hw_queues && \ ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++) #define hctx_for_each_ctx(hctx, ctx, i) \ for ((i) = 0; (i) < (hctx)->nr_ctx && \ ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++) static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) { if (rq->tag != -1) return rq->tag | (hctx->queue_num << BLK_QC_T_SHIFT); return rq->internal_tag | (hctx->queue_num << BLK_QC_T_SHIFT) | BLK_QC_T_INTERNAL; } static inline void blk_mq_cleanup_rq(struct request *rq) { if (rq->q->mq_ops->cleanup_rq) rq->q->mq_ops->cleanup_rq(rq); } blk_qc_t blk_mq_submit_bio(struct bio *bio); #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_MM_INLINE_H #define LINUX_MM_INLINE_H #include <linux/huge_mm.h> #include <linux/swap.h> /** * page_is_file_lru - should the page be on a file LRU or anon LRU? * @page: the page to test * * Returns 1 if @page is a regular filesystem backed page cache page or a lazily * freed anonymous page (e.g. via MADV_FREE). Returns 0 if @page is a normal * anonymous page, a tmpfs page or otherwise ram or swap backed page. Used by * functions that manipulate the LRU lists, to sort a page onto the right LRU * list. * * We would like to get this info without a page flag, but the state * needs to survive until the page is last deleted from the LRU, which * could be as far down as __page_cache_release. */ static inline int page_is_file_lru(struct page *page) { return !PageSwapBacked(page); } static __always_inline void __update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, int nr_pages) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); } static __always_inline void update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, int nr_pages) { __update_lru_size(lruvec, lru, zid, nr_pages); #ifdef CONFIG_MEMCG mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); #endif } static __always_inline void add_page_to_lru_list(struct page *page, struct lruvec *lruvec, enum lru_list lru) { update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); list_add(&page->lru, &lruvec->lists[lru]); } static __always_inline void add_page_to_lru_list_tail(struct page *page, struct lruvec *lruvec, enum lru_list lru) { update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); list_add_tail(&page->lru, &lruvec->lists[lru]); } static __always_inline void del_page_from_lru_list(struct page *page, struct lruvec *lruvec, enum lru_list lru) { list_del(&page->lru); update_lru_size(lruvec, lru, page_zonenum(page), -thp_nr_pages(page)); } /** * page_lru_base_type - which LRU list type should a page be on? * @page: the page to test * * Used for LRU list index arithmetic. * * Returns the base LRU type - file or anon - @page should be on. */ static inline enum lru_list page_lru_base_type(struct page *page) { if (page_is_file_lru(page)) return LRU_INACTIVE_FILE; return LRU_INACTIVE_ANON; } /** * page_off_lru - which LRU list was page on? clearing its lru flags. * @page: the page to test * * Returns the LRU list a page was on, as an index into the array of LRU * lists; and clears its Unevictable or Active flags, ready for freeing. */ static __always_inline enum lru_list page_off_lru(struct page *page) { enum lru_list lru; if (PageUnevictable(page)) { __ClearPageUnevictable(page); lru = LRU_UNEVICTABLE; } else { lru = page_lru_base_type(page); if (PageActive(page)) { __ClearPageActive(page); lru += LRU_ACTIVE; } } return lru; } /** * page_lru - which LRU list should a page be on? * @page: the page to test * * Returns the LRU list a page should be on, as an index * into the array of LRU lists. */ static __always_inline enum lru_list page_lru(struct page *page) { enum lru_list lru; if (PageUnevictable(page)) lru = LRU_UNEVICTABLE; else { lru = page_lru_base_type(page); if (PageActive(page)) lru += LRU_ACTIVE; } return lru; } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _ASM_X86_INSN_H #define _ASM_X86_INSN_H /* * x86 instruction analysis * * Copyright (C) IBM Corporation, 2009 */ /* insn_attr_t is defined in inat.h */ #include <asm/inat.h> struct insn_field { union { insn_value_t value; insn_byte_t bytes[4]; }; /* !0 if we've run insn_get_xxx() for this field */ unsigned char got; unsigned char nbytes; }; struct insn { struct insn_field prefixes; /* * Prefixes * prefixes.bytes[3]: last prefix */ struct insn_field rex_prefix; /* REX prefix */ struct insn_field vex_prefix; /* VEX prefix */ struct insn_field opcode; /* * opcode.bytes[0]: opcode1 * opcode.bytes[1]: opcode2 * opcode.bytes[2]: opcode3 */ struct insn_field modrm; struct insn_field sib; struct insn_field displacement; union { struct insn_field immediate; struct insn_field moffset1; /* for 64bit MOV */ struct insn_field immediate1; /* for 64bit imm or off16/32 */ }; union { struct insn_field moffset2; /* for 64bit MOV */ struct insn_field immediate2; /* for 64bit imm or seg16 */ }; int emulate_prefix_size; insn_attr_t attr; unsigned char opnd_bytes; unsigned char addr_bytes; unsigned char length; unsigned char x86_64; const insn_byte_t *kaddr; /* kernel address of insn to analyze */ const insn_byte_t *end_kaddr; /* kernel address of last insn in buffer */ const insn_byte_t *next_byte; }; #define MAX_INSN_SIZE 15 #define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) #define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) #define X86_MODRM_RM(modrm) ((modrm) & 0x07) #define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6) #define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3) #define X86_SIB_BASE(sib) ((sib) & 0x07) #define X86_REX_W(rex) ((rex) & 8) #define X86_REX_R(rex) ((rex) & 4) #define X86_REX_X(rex) ((rex) & 2) #define X86_REX_B(rex) ((rex) & 1) /* VEX bit flags */ #define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */ #define X86_VEX_R(vex) ((vex) & 0x80) /* VEX2/3 Byte1 */ #define X86_VEX_X(vex) ((vex) & 0x40) /* VEX3 Byte1 */ #define X86_VEX_B(vex) ((vex) & 0x20) /* VEX3 Byte1 */ #define X86_VEX_L(vex) ((vex) & 0x04) /* VEX3 Byte2, VEX2 Byte1 */ /* VEX bit fields */ #define X86_EVEX_M(vex) ((vex) & 0x03) /* EVEX Byte1 */ #define X86_VEX3_M(vex) ((vex) & 0x1f) /* VEX3 Byte1 */ #define X86_VEX2_M 1 /* VEX2.M always 1 */ #define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); extern void insn_get_prefixes(struct insn *insn); extern void insn_get_opcode(struct insn *insn); extern void insn_get_modrm(struct insn *insn); extern void insn_get_sib(struct insn *insn); extern void insn_get_displacement(struct insn *insn); extern void insn_get_immediate(struct insn *insn); extern void insn_get_length(struct insn *insn); /* Attribute will be determined after getting ModRM (for opcode groups) */ static inline void insn_get_attribute(struct insn *insn) { insn_get_modrm(insn); } /* Instruction uses RIP-relative addressing */ extern int insn_rip_relative(struct insn *insn); /* Init insn for kernel text */ static inline void kernel_insn_init(struct insn *insn, const void *kaddr, int buf_len) { #ifdef CONFIG_X86_64 insn_init(insn, kaddr, buf_len, 1); #else /* CONFIG_X86_32 */ insn_init(insn, kaddr, buf_len, 0); #endif } static inline int insn_is_avx(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); return (insn->vex_prefix.value != 0); } static inline int insn_is_evex(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); return (insn->vex_prefix.nbytes == 4); } static inline int insn_has_emulate_prefix(struct insn *insn) { return !!insn->emulate_prefix_size; } /* Ensure this instruction is decoded completely */ static inline int insn_complete(struct insn *insn) { return insn->opcode.got && insn->modrm.got && insn->sib.got && insn->displacement.got && insn->immediate.got; } static inline insn_byte_t insn_vex_m_bits(struct insn *insn) { if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ return X86_VEX2_M; else if (insn->vex_prefix.nbytes == 3) /* 3 bytes VEX */ return X86_VEX3_M(insn->vex_prefix.bytes[1]); else /* EVEX */ return X86_EVEX_M(insn->vex_prefix.bytes[1]); } static inline insn_byte_t insn_vex_p_bits(struct insn *insn) { if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ return X86_VEX_P(insn->vex_prefix.bytes[1]); else return X86_VEX_P(insn->vex_prefix.bytes[2]); } /* Get the last prefix id from last prefix or VEX prefix */ static inline int insn_last_prefix_id(struct insn *insn) { if (insn_is_avx(insn)) return insn_vex_p_bits(insn); /* VEX_p is a SIMD prefix id */ if (insn->prefixes.bytes[3]) return inat_get_last_prefix_id(insn->prefixes.bytes[3]); return 0; } /* Offset of each field from kaddr */ static inline int insn_offset_rex_prefix(struct insn *insn) { return insn->prefixes.nbytes; } static inline int insn_offset_vex_prefix(struct insn *insn) { return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes; } static inline int insn_offset_opcode(struct insn *insn) { return insn_offset_vex_prefix(insn) + insn->vex_prefix.nbytes; } static inline int insn_offset_modrm(struct insn *insn) { return insn_offset_opcode(insn) + insn->opcode.nbytes; } static inline int insn_offset_sib(struct insn *insn) { return insn_offset_modrm(insn) + insn->modrm.nbytes; } static inline int insn_offset_displacement(struct insn *insn) { return insn_offset_sib(insn) + insn->sib.nbytes; } static inline int insn_offset_immediate(struct insn *insn) { return insn_offset_displacement(insn) + insn->displacement.nbytes; } /** * for_each_insn_prefix() -- Iterate prefixes in the instruction * @insn: Pointer to struct insn. * @idx: Index storage. * @prefix: Prefix byte. * * Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix * and the index is stored in @idx (note that this @idx is just for a cursor, * do not change it.) * Since prefixes.nbytes can be bigger than 4 if some prefixes * are repeated, it cannot be used for looping over the prefixes. */ #define for_each_insn_prefix(insn, idx, prefix) \ for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) #define POP_SS_OPCODE 0x1f #define MOV_SREG_OPCODE 0x8e /* * Intel SDM Vol.3A 6.8.3 states; * "Any single-step trap that would be delivered following the MOV to SS * instruction or POP to SS instruction (because EFLAGS.TF is 1) is * suppressed." * This function returns true if @insn is MOV SS or POP SS. On these * instructions, single stepping is suppressed. */ static inline int insn_masking_exception(struct insn *insn) { return insn->opcode.bytes[0] == POP_SS_OPCODE || (insn->opcode.bytes[0] == MOV_SREG_OPCODE && X86_MODRM_REG(insn->modrm.bytes[0]) == 2); } #endif /* _ASM_X86_INSN_H */
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* audit.h -- Auditing support * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. * * Written by Rickard E. (Rik) Faith <faith@redhat.com> */ #ifndef _LINUX_AUDIT_H_ #define _LINUX_AUDIT_H_ #include <linux/sched.h> #include <linux/ptrace.h> #include <uapi/linux/audit.h> #include <uapi/linux/netfilter/nf_tables.h> #define AUDIT_INO_UNSET ((unsigned long)-1) #define AUDIT_DEV_UNSET ((dev_t)-1) struct audit_sig_info { uid_t uid; pid_t pid; char ctx[]; }; struct audit_buffer; struct audit_context; struct inode; struct netlink_skb_parms; struct path; struct linux_binprm; struct mq_attr; struct mqstat; struct audit_watch; struct audit_tree; struct sk_buff; struct audit_krule { u32 pflags; u32 flags; u32 listnr; u32 action; u32 mask[AUDIT_BITMASK_SIZE]; u32 buflen; /* for data alloc on list rules */ u32 field_count; char *filterkey; /* ties events to rules */ struct audit_field *fields; struct audit_field *arch_f; /* quick access to arch field */ struct audit_field *inode_f; /* quick access to an inode field */ struct audit_watch *watch; /* associated watch */ struct audit_tree *tree; /* associated watched tree */ struct audit_fsnotify_mark *exe; struct list_head rlist; /* entry in audit_{watch,tree}.rules list */ struct list_head list; /* for AUDIT_LIST* purposes only */ u64 prio; }; /* Flag to indicate legacy AUDIT_LOGINUID unset usage */ #define AUDIT_LOGINUID_LEGACY 0x1 struct audit_field { u32 type; union { u32 val; kuid_t uid; kgid_t gid; struct { char *lsm_str; void *lsm_rule; }; }; u32 op; }; enum audit_ntp_type { AUDIT_NTP_OFFSET, AUDIT_NTP_FREQ, AUDIT_NTP_STATUS, AUDIT_NTP_TAI, AUDIT_NTP_TICK, AUDIT_NTP_ADJUST, AUDIT_NTP_NVALS /* count */ }; #ifdef CONFIG_AUDITSYSCALL struct audit_ntp_val { long long oldval, newval; }; struct audit_ntp_data { struct audit_ntp_val vals[AUDIT_NTP_NVALS]; }; #else struct audit_ntp_data {}; #endif enum audit_nfcfgop { AUDIT_XT_OP_REGISTER, AUDIT_XT_OP_REPLACE, AUDIT_XT_OP_UNREGISTER, AUDIT_NFT_OP_TABLE_REGISTER, AUDIT_NFT_OP_TABLE_UNREGISTER, AUDIT_NFT_OP_CHAIN_REGISTER, AUDIT_NFT_OP_CHAIN_UNREGISTER, AUDIT_NFT_OP_RULE_REGISTER, AUDIT_NFT_OP_RULE_UNREGISTER, AUDIT_NFT_OP_SET_REGISTER, AUDIT_NFT_OP_SET_UNREGISTER, AUDIT_NFT_OP_SETELEM_REGISTER, AUDIT_NFT_OP_SETELEM_UNREGISTER, AUDIT_NFT_OP_GEN_REGISTER, AUDIT_NFT_OP_OBJ_REGISTER, AUDIT_NFT_OP_OBJ_UNREGISTER, AUDIT_NFT_OP_OBJ_RESET, AUDIT_NFT_OP_FLOWTABLE_REGISTER, AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, AUDIT_NFT_OP_INVALID, }; extern int is_audit_feature_set(int which); extern int __init audit_register_class(int class, unsigned *list); extern int audit_classify_syscall(int abi, unsigned syscall); extern int audit_classify_arch(int arch); /* only for compat system calls */ extern unsigned compat_write_class[]; extern unsigned compat_read_class[]; extern unsigned compat_dir_class[]; extern unsigned compat_chattr_class[]; extern unsigned compat_signal_class[]; extern int audit_classify_compat_syscall(int abi, unsigned syscall); /* audit_names->type values */ #define AUDIT_TYPE_UNKNOWN 0 /* we don't know yet */ #define AUDIT_TYPE_NORMAL 1 /* a "normal" audit record */ #define AUDIT_TYPE_PARENT 2 /* a parent audit record */ #define AUDIT_TYPE_CHILD_DELETE 3 /* a child being deleted */ #define AUDIT_TYPE_CHILD_CREATE 4 /* a child being created */ /* maximized args number that audit_socketcall can process */ #define AUDITSC_ARGS 6 /* bit values for ->signal->audit_tty */ #define AUDIT_TTY_ENABLE BIT(0) #define AUDIT_TTY_LOG_PASSWD BIT(1) struct filename; #define AUDIT_OFF 0 #define AUDIT_ON 1 #define AUDIT_LOCKED 2 #ifdef CONFIG_AUDIT /* These are defined in audit.c */ /* Public API */ extern __printf(4, 5) void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...); extern struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type); extern __printf(2, 3) void audit_log_format(struct audit_buffer *ab, const char *fmt, ...); extern void audit_log_end(struct audit_buffer *ab); extern bool audit_string_contains_control(const char *string, size_t len); extern void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len); extern void audit_log_n_string(struct audit_buffer *ab, const char *buf, size_t n); extern void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, size_t n); extern void audit_log_untrustedstring(struct audit_buffer *ab, const char *string); extern void audit_log_d_path(struct audit_buffer *ab, const char *prefix, const struct path *path); extern void audit_log_key(struct audit_buffer *ab, char *key); extern void audit_log_path_denied(int type, const char *operation); extern void audit_log_lost(const char *message); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); extern int audit_update_lsm_rules(void); /* Private API (for audit.c only) */ extern int audit_rule_change(int type, int seq, void *data, size_t datasz); extern int audit_list_rules_send(struct sk_buff *request_skb, int seq); extern int audit_set_loginuid(kuid_t loginuid); static inline kuid_t audit_get_loginuid(struct task_struct *tsk) { return tsk->loginuid; } static inline unsigned int audit_get_sessionid(struct task_struct *tsk) { return tsk->sessionid; } extern u32 audit_enabled; extern int audit_signal_info(int sig, struct task_struct *t); #else /* CONFIG_AUDIT */ static inline __printf(4, 5) void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...) { } static inline struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { return NULL; } static inline __printf(2, 3) void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) { } static inline void audit_log_end(struct audit_buffer *ab) { } static inline void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) { } static inline void audit_log_n_string(struct audit_buffer *ab, const char *buf, size_t n) { } static inline void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, size_t n) { } static inline void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) { } static inline void audit_log_d_path(struct audit_buffer *ab, const char *prefix, const struct path *path) { } static inline void audit_log_key(struct audit_buffer *ab, char *key) { } static inline void audit_log_path_denied(int type, const char *operation) { } static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; } static inline void audit_log_task_info(struct audit_buffer *ab) { } static inline kuid_t audit_get_loginuid(struct task_struct *tsk) { return INVALID_UID; } static inline unsigned int audit_get_sessionid(struct task_struct *tsk) { return AUDIT_SID_UNSET; } #define audit_enabled AUDIT_OFF static inline int audit_signal_info(int sig, struct task_struct *t) { return 0; } #endif /* CONFIG_AUDIT */ #ifdef CONFIG_AUDIT_COMPAT_GENERIC #define audit_is_compat(arch) (!((arch) & __AUDIT_ARCH_64BIT)) #else #define audit_is_compat(arch) false #endif #define AUDIT_INODE_PARENT 1 /* dentry represents the parent */ #define AUDIT_INODE_HIDDEN 2 /* audit record should be hidden */ #define AUDIT_INODE_NOEVAL 4 /* audit record incomplete */ #ifdef CONFIG_AUDITSYSCALL #include <asm/syscall.h> /* for syscall_get_arch() */ /* These are defined in auditsc.c */ /* Public API */ extern int audit_alloc(struct task_struct *task); extern void __audit_free(struct task_struct *task); extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3); extern void __audit_syscall_exit(int ret_success, long ret_value); extern struct filename *__audit_reusename(const __user char *uptr); extern void __audit_getname(struct filename *name); extern void __audit_getcwd(void); extern void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags); extern void __audit_file(const struct file *); extern void __audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type); extern void audit_seccomp(unsigned long syscall, long signr, int code); extern void audit_seccomp_actions_logged(const char *names, const char *old_names, int res); extern void __audit_ptrace(struct task_struct *t); static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx) { task->audit_context = ctx; } static inline struct audit_context *audit_context(void) { return current->audit_context; } static inline bool audit_dummy_context(void) { void *p = audit_context(); return !p || *(int *)p; } static inline void audit_free(struct task_struct *task) { if (unlikely(task->audit_context)) __audit_free(task); } static inline void audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3) { if (unlikely(audit_context())) __audit_syscall_entry(major, a0, a1, a2, a3); } static inline void audit_syscall_exit(void *pt_regs) { if (unlikely(audit_context())) { int success = is_syscall_success(pt_regs); long return_code = regs_return_value(pt_regs); __audit_syscall_exit(success, return_code); } } static inline struct filename *audit_reusename(const __user char *name) { if (unlikely(!audit_dummy_context())) return __audit_reusename(name); return NULL; } static inline void audit_getname(struct filename *name) { if (unlikely(!audit_dummy_context())) __audit_getname(name); } static inline void audit_getcwd(void) { if (unlikely(audit_context())) __audit_getcwd(); } static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int aflags) { if (unlikely(!audit_dummy_context())) __audit_inode(name, dentry, aflags); } static inline void audit_file(struct file *file) { if (unlikely(!audit_dummy_context())) __audit_file(file); } static inline void audit_inode_parent_hidden(struct filename *name, const struct dentry *dentry) { if (unlikely(!audit_dummy_context())) __audit_inode(name, dentry, AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN); } static inline void audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { if (unlikely(!audit_dummy_context())) __audit_inode_child(parent, dentry, type); } void audit_core_dumps(long signr); static inline void audit_ptrace(struct task_struct *t) { if (unlikely(!audit_dummy_context())) __audit_ptrace(t); } /* Private API (for audit.c only) */ extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); extern void __audit_bprm(struct linux_binprm *bprm); extern int __audit_socketcall(int nargs, unsigned long *args); extern int __audit_sockaddr(int len, void *addr); extern void __audit_fd_pair(int fd1, int fd2); extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr); extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout); extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification); extern void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat); extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old); extern void __audit_log_capset(const struct cred *new, const struct cred *old); extern void __audit_mmap_fd(int fd, int flags); extern void __audit_log_kern_module(char *name); extern void __audit_fanotify(unsigned int response); extern void __audit_tk_injoffset(struct timespec64 offset); extern void __audit_ntp_log(const struct audit_ntp_data *ad); extern void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp); static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) { if (unlikely(!audit_dummy_context())) __audit_ipc_obj(ipcp); } static inline void audit_fd_pair(int fd1, int fd2) { if (unlikely(!audit_dummy_context())) __audit_fd_pair(fd1, fd2); } static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { if (unlikely(!audit_dummy_context())) __audit_ipc_set_perm(qbytes, uid, gid, mode); } static inline void audit_bprm(struct linux_binprm *bprm) { if (unlikely(!audit_dummy_context())) __audit_bprm(bprm); } static inline int audit_socketcall(int nargs, unsigned long *args) { if (unlikely(!audit_dummy_context())) return __audit_socketcall(nargs, args); return 0; } static inline int audit_socketcall_compat(int nargs, u32 *args) { unsigned long a[AUDITSC_ARGS]; int i; if (audit_dummy_context()) return 0; for (i = 0; i < nargs; i++) a[i] = (unsigned long)args[i]; return __audit_socketcall(nargs, a); } static inline int audit_sockaddr(int len, void *addr) { if (unlikely(!audit_dummy_context())) return __audit_sockaddr(len, addr); return 0; } static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { if (unlikely(!audit_dummy_context())) __audit_mq_open(oflag, mode, attr); } static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout) { if (unlikely(!audit_dummy_context())) __audit_mq_sendrecv(mqdes, msg_len, msg_prio, abs_timeout); } static inline void audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) { if (unlikely(!audit_dummy_context())) __audit_mq_notify(mqdes, notification); } static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { if (unlikely(!audit_dummy_context())) __audit_mq_getsetattr(mqdes, mqstat); } static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old) { if (unlikely(!audit_dummy_context())) return __audit_log_bprm_fcaps(bprm, new, old); return 0; } static inline void audit_log_capset(const struct cred *new, const struct cred *old) { if (unlikely(!audit_dummy_context())) __audit_log_capset(new, old); } static inline void audit_mmap_fd(int fd, int flags) { if (unlikely(!audit_dummy_context())) __audit_mmap_fd(fd, flags); } static inline void audit_log_kern_module(char *name) { if (!audit_dummy_context()) __audit_log_kern_module(name); } static inline void audit_fanotify(unsigned int response) { if (!audit_dummy_context()) __audit_fanotify(response); } static inline void audit_tk_injoffset(struct timespec64 offset) { /* ignore no-op events */ if (offset.tv_sec == 0 && offset.tv_nsec == 0) return; if (!audit_dummy_context()) __audit_tk_injoffset(offset); } static inline void audit_ntp_init(struct audit_ntp_data *ad) { memset(ad, 0, sizeof(*ad)); } static inline void audit_ntp_set_old(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { ad->vals[type].oldval = val; } static inline void audit_ntp_set_new(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { ad->vals[type].newval = val; } static inline void audit_ntp_log(const struct audit_ntp_data *ad) { if (!audit_dummy_context()) __audit_ntp_log(ad); } static inline void audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp) { if (audit_enabled) __audit_log_nfcfg(name, af, nentries, op, gfp); } extern int audit_n_rules; extern int audit_signals; #else /* CONFIG_AUDITSYSCALL */ static inline int audit_alloc(struct task_struct *task) { return 0; } static inline void audit_free(struct task_struct *task) { } static inline void audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3) { } static inline void audit_syscall_exit(void *pt_regs) { } static inline bool audit_dummy_context(void) { return true; } static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx) { } static inline struct audit_context *audit_context(void) { return NULL; } static inline struct filename *audit_reusename(const __user char *name) { return NULL; } static inline void audit_getname(struct filename *name) { } static inline void audit_getcwd(void) { } static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int aflags) { } static inline void audit_file(struct file *file) { } static inline void audit_inode_parent_hidden(struct filename *name, const struct dentry *dentry) { } static inline void audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { } static inline void audit_core_dumps(long signr) { } static inline void audit_seccomp(unsigned long syscall, long signr, int code) { } static inline void audit_seccomp_actions_logged(const char *names, const char *old_names, int res) { } static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) { } static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { } static inline void audit_bprm(struct linux_binprm *bprm) { } static inline int audit_socketcall(int nargs, unsigned long *args) { return 0; } static inline int audit_socketcall_compat(int nargs, u32 *args) { return 0; } static inline void audit_fd_pair(int fd1, int fd2) { } static inline int audit_sockaddr(int len, void *addr) { return 0; } static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { } static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout) { } static inline void audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) { } static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { } static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old) { return 0; } static inline void audit_log_capset(const struct cred *new, const struct cred *old) { } static inline void audit_mmap_fd(int fd, int flags) { } static inline void audit_log_kern_module(char *name) { } static inline void audit_fanotify(unsigned int response) { } static inline void audit_tk_injoffset(struct timespec64 offset) { } static inline void audit_ntp_init(struct audit_ntp_data *ad) { } static inline void audit_ntp_set_old(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { } static inline void audit_ntp_set_new(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { } static inline void audit_ntp_log(const struct audit_ntp_data *ad) { } static inline void audit_ptrace(struct task_struct *t) { } static inline void audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp) { } #define audit_n_rules 0 #define audit_signals 0 #endif /* CONFIG_AUDITSYSCALL */ static inline bool audit_loginuid_set(struct task_struct *tsk) { return uid_valid(audit_get_loginuid(tsk)); } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs */ #ifndef _ASM_X86_STACKTRACE_H #define _ASM_X86_STACKTRACE_H #include <linux/uaccess.h> #include <linux/ptrace.h> #include <asm/cpu_entry_area.h> #include <asm/switch_to.h> enum stack_type { STACK_TYPE_UNKNOWN, STACK_TYPE_TASK, STACK_TYPE_IRQ, STACK_TYPE_SOFTIRQ, STACK_TYPE_ENTRY, STACK_TYPE_EXCEPTION, STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, }; struct stack_info { enum stack_type type; unsigned long *begin, *end, *next_sp; }; bool in_task_stack(unsigned long *stack, struct task_struct *task, struct stack_info *info); bool in_entry_stack(unsigned long *stack, struct stack_info *info); int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask); bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task, struct stack_info *info); const char *stack_type_name(enum stack_type type); static inline bool on_stack(struct stack_info *info, void *addr, size_t len) { void *begin = info->begin; void *end = info->end; return (info->type != STACK_TYPE_UNKNOWN && addr >= begin && addr < end && addr + len > begin && addr + len <= end); } #ifdef CONFIG_X86_32 #define STACKSLOTS_PER_LINE 8 #else #define STACKSLOTS_PER_LINE 4 #endif #ifdef CONFIG_FRAME_POINTER static inline unsigned long * get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { if (regs) return (unsigned long *)regs->bp; if (task == current) return __builtin_frame_address(0); return &((struct inactive_task_frame *)task->thread.sp)->bp; } #else static inline unsigned long * get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { return NULL; } #endif /* CONFIG_FRAME_POINTER */ static inline unsigned long * get_stack_pointer(struct task_struct *task, struct pt_regs *regs) { if (regs) return (unsigned long *)regs->sp; if (task == current) return __builtin_frame_address(0); return (unsigned long *)task->thread.sp; } void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, const char *log_lvl); /* The form of the top of the frame on the stack */ struct stack_frame { struct stack_frame *next_frame; unsigned long return_address; }; struct stack_frame_ia32 { u32 next_frame; u32 return_address; }; void show_opcodes(struct pt_regs *regs, const char *loglvl); void show_ip(struct pt_regs *regs, const char *loglvl); #endif /* _ASM_X86_STACKTRACE_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * 25-Jul-1998 Major changes to allow for ip chain table * * 3-Jan-2000 Named tables to allow packet selection for different uses. */ /* * Format of an IP firewall descriptor * * src, dst, src_mask, dst_mask are always stored in network byte order. * flags are stored in host byte order (of course). * Port numbers are stored in HOST byte order. */ #ifndef _UAPI_IPTABLES_H #define _UAPI_IPTABLES_H #include <linux/types.h> #include <linux/compiler.h> #include <linux/if.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter/x_tables.h> #ifndef __KERNEL__ #define IPT_FUNCTION_MAXNAMELEN XT_FUNCTION_MAXNAMELEN #define IPT_TABLE_MAXNAMELEN XT_TABLE_MAXNAMELEN #define ipt_match xt_match #define ipt_target xt_target #define ipt_table xt_table #define ipt_get_revision xt_get_revision #define ipt_entry_match xt_entry_match #define ipt_entry_target xt_entry_target #define ipt_standard_target xt_standard_target #define ipt_error_target xt_error_target #define ipt_counters xt_counters #define IPT_CONTINUE XT_CONTINUE #define IPT_RETURN XT_RETURN /* This group is older than old (iptables < v1.4.0-rc1~89) */ #include <linux/netfilter/xt_tcpudp.h> #define ipt_udp xt_udp #define ipt_tcp xt_tcp #define IPT_TCP_INV_SRCPT XT_TCP_INV_SRCPT #define IPT_TCP_INV_DSTPT XT_TCP_INV_DSTPT #define IPT_TCP_INV_FLAGS XT_TCP_INV_FLAGS #define IPT_TCP_INV_OPTION XT_TCP_INV_OPTION #define IPT_TCP_INV_MASK XT_TCP_INV_MASK #define IPT_UDP_INV_SRCPT XT_UDP_INV_SRCPT #define IPT_UDP_INV_DSTPT XT_UDP_INV_DSTPT #define IPT_UDP_INV_MASK XT_UDP_INV_MASK /* The argument to IPT_SO_ADD_COUNTERS. */ #define ipt_counters_info xt_counters_info /* Standard return verdict, or do jump. */ #define IPT_STANDARD_TARGET XT_STANDARD_TARGET /* Error verdict. */ #define IPT_ERROR_TARGET XT_ERROR_TARGET /* fn returns 0 to continue iteration */ #define IPT_MATCH_ITERATE(e, fn, args...) \ XT_MATCH_ITERATE(struct ipt_entry, e, fn, ## args) /* fn returns 0 to continue iteration */ #define IPT_ENTRY_ITERATE(entries, size, fn, args...) \ XT_ENTRY_ITERATE(struct ipt_entry, entries, size, fn, ## args) #endif /* Yes, Virginia, you have to zero the padding. */ struct ipt_ip { /* Source and destination IP addr */ struct in_addr src, dst; /* Mask for src and dest IP addr */ struct in_addr smsk, dmsk; char iniface[IFNAMSIZ], outiface[IFNAMSIZ]; unsigned char iniface_mask[IFNAMSIZ], outiface_mask[IFNAMSIZ]; /* Protocol, 0 = ANY */ __u16 proto; /* Flags word */ __u8 flags; /* Inverse flags */ __u8 invflags; }; /* Values for "flag" field in struct ipt_ip (general ip structure). */ #define IPT_F_FRAG 0x01 /* Set if rule is a fragment rule */ #define IPT_F_GOTO 0x02 /* Set if jump is a goto */ #define IPT_F_MASK 0x03 /* All possible flag bits mask. */ /* Values for "inv" field in struct ipt_ip. */ #define IPT_INV_VIA_IN 0x01 /* Invert the sense of IN IFACE. */ #define IPT_INV_VIA_OUT 0x02 /* Invert the sense of OUT IFACE */ #define IPT_INV_TOS 0x04 /* Invert the sense of TOS. */ #define IPT_INV_SRCIP 0x08 /* Invert the sense of SRC IP. */ #define IPT_INV_DSTIP 0x10 /* Invert the sense of DST OP. */ #define IPT_INV_FRAG 0x20 /* Invert the sense of FRAG. */ #define IPT_INV_PROTO XT_INV_PROTO #define IPT_INV_MASK 0x7F /* All possible flag bits mask. */ /* This structure defines each of the firewall rules. Consists of 3 parts which are 1) general IP header stuff 2) match specific stuff 3) the target to perform if the rule matches */ struct ipt_entry { struct ipt_ip ip; /* Mark with fields that we care about. */ unsigned int nfcache; /* Size of ipt_entry + matches */ __u16 target_offset; /* Size of ipt_entry + matches + target */ __u16 next_offset; /* Back pointer */ unsigned int comefrom; /* Packet and byte counters. */ struct xt_counters counters; /* The matches (if any), then the target. */ unsigned char elems[0]; }; /* * New IP firewall options for [gs]etsockopt at the RAW IP level. * Unlike BSD Linux inherits IP options so you don't have to use a raw * socket for this. Instead we check rights in the calls. * * ATTENTION: check linux/in.h before adding new number here. */ #define IPT_BASE_CTL 64 #define IPT_SO_SET_REPLACE (IPT_BASE_CTL) #define IPT_SO_SET_ADD_COUNTERS (IPT_BASE_CTL + 1) #define IPT_SO_SET_MAX IPT_SO_SET_ADD_COUNTERS #define IPT_SO_GET_INFO (IPT_BASE_CTL) #define IPT_SO_GET_ENTRIES (IPT_BASE_CTL + 1) #define IPT_SO_GET_REVISION_MATCH (IPT_BASE_CTL + 2) #define IPT_SO_GET_REVISION_TARGET (IPT_BASE_CTL + 3) #define IPT_SO_GET_MAX IPT_SO_GET_REVISION_TARGET /* ICMP matching stuff */ struct ipt_icmp { __u8 type; /* type to match */ __u8 code[2]; /* range of code */ __u8 invflags; /* Inverse flags */ }; /* Values for "inv" field for struct ipt_icmp. */ #define IPT_ICMP_INV 0x01 /* Invert the sense of type/code test */ /* The argument to IPT_SO_GET_INFO */ struct ipt_getinfo { /* Which table: caller fills this in. */ char name[XT_TABLE_MAXNAMELEN]; /* Kernel fills these in. */ /* Which hook entry points are valid: bitmask */ unsigned int valid_hooks; /* Hook entry points: one per netfilter hook. */ unsigned int hook_entry[NF_INET_NUMHOOKS]; /* Underflow points. */ unsigned int underflow[NF_INET_NUMHOOKS]; /* Number of entries */ unsigned int num_entries; /* Size of entries. */ unsigned int size; }; /* The argument to IPT_SO_SET_REPLACE. */ struct ipt_replace { /* Which table. */ char name[XT_TABLE_MAXNAMELEN]; /* Which hook entry points are valid: bitmask. You can't change this. */ unsigned int valid_hooks; /* Number of entries */ unsigned int num_entries; /* Total size of new entries */ unsigned int size; /* Hook entry points. */ unsigned int hook_entry[NF_INET_NUMHOOKS]; /* Underflow points. */ unsigned int underflow[NF_INET_NUMHOOKS]; /* Information about old entries: */ /* Number of counters (must be equal to current number of entries). */ unsigned int num_counters; /* The old entries' counters. */ struct xt_counters __user *counters; /* The entries (hang off end: not really an array). */ struct ipt_entry entries[0]; }; /* The argument to IPT_SO_GET_ENTRIES. */ struct ipt_get_entries { /* Which table: user fills this in. */ char name[XT_TABLE_MAXNAMELEN]; /* User fills this in: total entry size. */ unsigned int size; /* The entries. */ struct ipt_entry entrytable[0]; }; /* Helper functions */ static __inline__ struct xt_entry_target * ipt_get_target(struct ipt_entry *e) { return (struct xt_entry_target *)((char *)e + e->target_offset); } /* * Main firewall chains definitions and global var's definitions. */ #endif /* _UAPI_IPTABLES_H */
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 // SPDX-License-Identifier: GPL-2.0-only #include <linux/mm.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/compiler.h> #include <linux/export.h> #include <linux/err.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/sched/task_stack.h> #include <linux/security.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/mman.h> #include <linux/hugetlb.h> #include <linux/vmalloc.h> #include <linux/userfaultfd_k.h> #include <linux/elf.h> #include <linux/elf-randomize.h> #include <linux/personality.h> #include <linux/random.h> #include <linux/processor.h> #include <linux/sizes.h> #include <linux/compat.h> #include <linux/uaccess.h> #include "internal.h" /** * kfree_const - conditionally free memory * @x: pointer to the memory * * Function calls kfree only if @x is not in .rodata section. */ void kfree_const(const void *x) { if (!is_kernel_rodata((unsigned long)x)) kfree(x); } EXPORT_SYMBOL(kfree_const); /** * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Return: newly allocated copy of @s or %NULL in case of error */ char *kstrdup(const char *s, gfp_t gfp) { size_t len; char *buf; if (!s) return NULL; len = strlen(s) + 1; buf = kmalloc_track_caller(len, gfp); if (buf) memcpy(buf, s, len); return buf; } EXPORT_SYMBOL(kstrdup); /** * kstrdup_const - conditionally duplicate an existing const string * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Note: Strings allocated by kstrdup_const should be freed by kfree_const and * must not be passed to krealloc(). * * Return: source string if it is in .rodata section otherwise * fallback to kstrdup. */ const char *kstrdup_const(const char *s, gfp_t gfp) { if (is_kernel_rodata((unsigned long)s)) return s; return kstrdup(s, gfp); } EXPORT_SYMBOL(kstrdup_const); /** * kstrndup - allocate space for and copy an existing string * @s: the string to duplicate * @max: read at most @max chars from @s * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Note: Use kmemdup_nul() instead if the size is known exactly. * * Return: newly allocated copy of @s or %NULL in case of error */ char *kstrndup(const char *s, size_t max, gfp_t gfp) { size_t len; char *buf; if (!s) return NULL; len = strnlen(s, max); buf = kmalloc_track_caller(len+1, gfp); if (buf) { memcpy(buf, s, len); buf[len] = '\0'; } return buf; } EXPORT_SYMBOL(kstrndup); /** * kmemdup - duplicate region of memory * * @src: memory region to duplicate * @len: memory region length * @gfp: GFP mask to use * * Return: newly allocated copy of @src or %NULL in case of error */ void *kmemdup(const void *src, size_t len, gfp_t gfp) { void *p; p = kmalloc_track_caller(len, gfp); if (p) memcpy(p, src, len); return p; } EXPORT_SYMBOL(kmemdup); /** * kmemdup_nul - Create a NUL-terminated string from unterminated data * @s: The data to stringify * @len: The size of the data * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Return: newly allocated copy of @s with NUL-termination or %NULL in * case of error */ char *kmemdup_nul(const char *s, size_t len, gfp_t gfp) { char *buf; if (!s) return NULL; buf = kmalloc_track_caller(len + 1, gfp); if (buf) { memcpy(buf, s, len); buf[len] = '\0'; } return buf; } EXPORT_SYMBOL(kmemdup_nul); /** * memdup_user - duplicate memory region from user space * * @src: source address in user space * @len: number of bytes to copy * * Return: an ERR_PTR() on failure. Result is physically * contiguous, to be freed by kfree(). */ void *memdup_user(const void __user *src, size_t len) { void *p; p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_user(p, src, len)) { kfree(p); return ERR_PTR(-EFAULT); } return p; } EXPORT_SYMBOL(memdup_user); /** * vmemdup_user - duplicate memory region from user space * * @src: source address in user space * @len: number of bytes to copy * * Return: an ERR_PTR() on failure. Result may be not * physically contiguous. Use kvfree() to free. */ void *vmemdup_user(const void __user *src, size_t len) { void *p; p = kvmalloc(len, GFP_USER); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_user(p, src, len)) { kvfree(p); return ERR_PTR(-EFAULT); } return p; } EXPORT_SYMBOL(vmemdup_user); /** * strndup_user - duplicate an existing string from user space * @s: The string to duplicate * @n: Maximum number of bytes to copy, including the trailing NUL. * * Return: newly allocated copy of @s or an ERR_PTR() in case of error */ char *strndup_user(const char __user *s, long n) { char *p; long length; length = strnlen_user(s, n); if (!length) return ERR_PTR(-EFAULT); if (length > n) return ERR_PTR(-EINVAL); p = memdup_user(s, length); if (IS_ERR(p)) return p; p[length - 1] = '\0'; return p; } EXPORT_SYMBOL(strndup_user); /** * memdup_user_nul - duplicate memory region from user space and NUL-terminate * * @src: source address in user space * @len: number of bytes to copy * * Return: an ERR_PTR() on failure. */ void *memdup_user_nul(const void __user *src, size_t len) { char *p; /* * Always use GFP_KERNEL, since copy_from_user() can sleep and * cause pagefault, which makes it pointless to use GFP_NOFS * or GFP_ATOMIC. */ p = kmalloc_track_caller(len + 1, GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_user(p, src, len)) { kfree(p); return ERR_PTR(-EFAULT); } p[len] = '\0'; return p; } EXPORT_SYMBOL(memdup_user_nul); void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev) { struct vm_area_struct *next; vma->vm_prev = prev; if (prev) { next = prev->vm_next; prev->vm_next = vma; } else { next = mm->mmap; mm->mmap = vma; } vma->vm_next = next; if (next) next->vm_prev = vma; } void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma) { struct vm_area_struct *prev, *next; next = vma->vm_next; prev = vma->vm_prev; if (prev) prev->vm_next = next; else mm->mmap = next; if (next) next->vm_prev = prev; } /* Check if the vma is being used as a stack by this task */ int vma_is_stack_for_current(struct vm_area_struct *vma) { struct task_struct * __maybe_unused t = current; return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } #ifndef STACK_RND_MASK #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ #endif unsigned long randomize_stack_top(unsigned long stack_top) { unsigned long random_variable = 0; if (current->flags & PF_RANDOMIZE) { random_variable = get_random_long(); random_variable &= STACK_RND_MASK; random_variable <<= PAGE_SHIFT; } #ifdef CONFIG_STACK_GROWSUP return PAGE_ALIGN(stack_top) + random_variable; #else return PAGE_ALIGN(stack_top) - random_variable; #endif } #ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT unsigned long arch_randomize_brk(struct mm_struct *mm) { /* Is the current task 32bit ? */ if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) return randomize_page(mm->brk, SZ_32M); return randomize_page(mm->brk, SZ_1G); } unsigned long arch_mmap_rnd(void) { unsigned long rnd; #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS if (is_compat_task()) rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); else #endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */ rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); return rnd << PAGE_SHIFT; } static int mmap_is_legacy(struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; if (rlim_stack->rlim_cur == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; } /* * Leave enough space between the mmap area and the stack to honour ulimit in * the face of randomisation. */ #define MIN_GAP (SZ_128M) #define MAX_GAP (STACK_TOP / 6 * 5) static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_guard_gap; /* Account for stack randomization if necessary */ if (current->flags & PF_RANDOMIZE) pad += (STACK_RND_MASK << PAGE_SHIFT); /* Values close to RLIM_INFINITY can overflow. */ if (gap + pad > gap) gap += pad; if (gap < MIN_GAP) gap = MIN_GAP; else if (gap > MAX_GAP) gap = MAX_GAP; return PAGE_ALIGN(STACK_TOP - gap - rnd); } void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm->get_unmapped_area = arch_get_unmapped_area; } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; } #endif /** * __account_locked_vm - account locked pages to an mm's locked_vm * @mm: mm to account against * @pages: number of pages to account * @inc: %true if @pages should be considered positive, %false if not * @task: task used to check RLIMIT_MEMLOCK * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped * * Assumes @task and @mm are valid (i.e. at least one reference on each), and * that mmap_lock is held as writer. * * Return: * * 0 on success * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. */ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, struct task_struct *task, bool bypass_rlim) { unsigned long locked_vm, limit; int ret = 0; mmap_assert_write_locked(mm); locked_vm = mm->locked_vm; if (inc) { if (!bypass_rlim) { limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; if (locked_vm + pages > limit) ret = -ENOMEM; } if (!ret) mm->locked_vm = locked_vm + pages; } else { WARN_ON_ONCE(pages > locked_vm); mm->locked_vm = locked_vm - pages; } pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid, (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT, locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK), ret ? " - exceeded" : ""); return ret; } EXPORT_SYMBOL_GPL(__account_locked_vm); /** * account_locked_vm - account locked pages to an mm's locked_vm * @mm: mm to account against, may be NULL * @pages: number of pages to account * @inc: %true if @pages should be considered positive, %false if not * * Assumes a non-NULL @mm is valid (i.e. at least one reference on it). * * Return: * * 0 on success, or if mm is NULL * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. */ int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc) { int ret; if (pages == 0 || !mm) return 0; mmap_write_lock(mm); ret = __account_locked_vm(mm, pages, inc, current, capable(CAP_IPC_LOCK)); mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL_GPL(account_locked_vm); unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long pgoff) { unsigned long ret; struct mm_struct *mm = current->mm; unsigned long populate; LIST_HEAD(uf); ret = security_mmap_file(file, prot, flag); if (!ret) { if (mmap_write_lock_killable(mm)) return -EINTR; ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate, &uf); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(ret, populate); } return ret; } unsigned long vm_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long offset) { if (unlikely(offset + PAGE_ALIGN(len) < offset)) return -EINVAL; if (unlikely(offset_in_page(offset))) return -EINVAL; return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); } EXPORT_SYMBOL(vm_mmap); /** * kvmalloc_node - attempt to allocate physically contiguous memory, but upon * failure, fall back to non-contiguous (vmalloc) allocation. * @size: size of the request. * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. * @node: numa node to allocate from * * Uses kmalloc to get the memory but if the allocation fails then falls back * to the vmalloc allocator. Use kvfree for freeing the memory. * * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is * preferable to the vmalloc fallback, due to visible performance drawbacks. * * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not * fall back to vmalloc. * * Return: pointer to the allocated memory of %NULL in case of failure */ void *kvmalloc_node(size_t size, gfp_t flags, int node) { gfp_t kmalloc_flags = flags; void *ret; /* * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables) * so the given set of flags has to be compatible. */ if ((flags & GFP_KERNEL) != GFP_KERNEL) return kmalloc_node(size, flags, node); /* * We want to attempt a large physically contiguous block first because * it is less likely to fragment multiple larger blocks and therefore * contribute to a long term fragmentation less than vmalloc fallback. * However make sure that larger requests are not too disruptive - no * OOM killer and no allocation failure warnings as we have a fallback. */ if (size > PAGE_SIZE) { kmalloc_flags |= __GFP_NOWARN; if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL)) kmalloc_flags |= __GFP_NORETRY; } ret = kmalloc_node(size, kmalloc_flags, node); /* * It doesn't really make sense to fallback to vmalloc for sub page * requests */ if (ret || size <= PAGE_SIZE) return ret; /* Don't even allow crazy sizes */ if (WARN_ON_ONCE(size > INT_MAX)) return NULL; return __vmalloc_node(size, 1, flags, node, __builtin_return_address(0)); } EXPORT_SYMBOL(kvmalloc_node); /** * kvfree() - Free memory. * @addr: Pointer to allocated memory. * * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc(). * It is slightly more efficient to use kfree() or vfree() if you are certain * that you know which one to use. * * Context: Either preemptible task context or not-NMI interrupt. */ void kvfree(const void *addr) { if (is_vmalloc_addr(addr)) vfree(addr); else kfree(addr); } EXPORT_SYMBOL(kvfree); /** * kvfree_sensitive - Free a data object containing sensitive information. * @addr: address of the data object to be freed. * @len: length of the data object. * * Use the special memzero_explicit() function to clear the content of a * kvmalloc'ed object containing sensitive data to make sure that the * compiler won't optimize out the data clearing. */ void kvfree_sensitive(const void *addr, size_t len) { if (likely(!ZERO_OR_NULL_PTR(addr))) { memzero_explicit((void *)addr, len); kvfree(addr); } } EXPORT_SYMBOL(kvfree_sensitive); static inline void *__page_rmapping(struct page *page) { unsigned long mapping; mapping = (unsigned long)page->mapping; mapping &= ~PAGE_MAPPING_FLAGS; return (void *)mapping; } /* Neutral page->mapping pointer to address_space or anon_vma or other */ void *page_rmapping(struct page *page) { page = compound_head(page); return __page_rmapping(page); } /* * Return true if this page is mapped into pagetables. * For compound page it returns true if any subpage of compound page is mapped. */ bool page_mapped(struct page *page) { int i; if (likely(!PageCompound(page))) return atomic_read(&page->_mapcount) >= 0; page = compound_head(page); if (atomic_read(compound_mapcount_ptr(page)) >= 0) return true; if (PageHuge(page)) return false; for (i = 0; i < compound_nr(page); i++) { if (atomic_read(&page[i]._mapcount) >= 0) return true; } return false; } EXPORT_SYMBOL(page_mapped); struct anon_vma *page_anon_vma(struct page *page) { unsigned long mapping; page = compound_head(page); mapping = (unsigned long)page->mapping; if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) return NULL; return __page_rmapping(page); } struct address_space *page_mapping(struct page *page) { struct address_space *mapping; page = compound_head(page); /* This happens if someone calls flush_dcache_page on slab page */ if (unlikely(PageSlab(page))) return NULL; if (unlikely(PageSwapCache(page))) { swp_entry_t entry; entry.val = page_private(page); return swap_address_space(entry); } mapping = page->mapping; if ((unsigned long)mapping & PAGE_MAPPING_ANON) return NULL; return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS); } EXPORT_SYMBOL(page_mapping); /* * For file cache pages, return the address_space, otherwise return NULL */ struct address_space *page_mapping_file(struct page *page) { if (unlikely(PageSwapCache(page))) return NULL; return page_mapping(page); } /* Slow path of page_mapcount() for compound pages */ int __page_mapcount(struct page *page) { int ret; ret = atomic_read(&page->_mapcount) + 1; /* * For file THP page->_mapcount contains total number of mapping * of the page: no need to look into compound_mapcount. */ if (!PageAnon(page) && !PageHuge(page)) return ret; page = compound_head(page); ret += atomic_read(compound_mapcount_ptr(page)) + 1; if (PageDoubleMap(page)) ret--; return ret; } EXPORT_SYMBOL_GPL(__page_mapcount); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; int sysctl_overcommit_ratio __read_mostly = 50; unsigned long sysctl_overcommit_kbytes __read_mostly; int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret == 0 && write) sysctl_overcommit_kbytes = 0; return ret; } static void sync_overcommit_as(struct work_struct *dummy) { percpu_counter_sync(&vm_committed_as); } int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int new_policy = -1; int ret; /* * The deviation of sync_overcommit_as could be big with loose policy * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply * with the strict "NEVER", and to avoid possible race condtion (even * though user usually won't too frequently do the switching to policy * OVERCOMMIT_NEVER), the switch is done in the following order: * 1. changing the batch * 2. sync percpu count on each CPU * 3. switch the policy */ if (write) { t = *table; t.data = &new_policy; ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); if (ret || new_policy == -1) return ret; mm_compute_batch(new_policy); if (new_policy == OVERCOMMIT_NEVER) schedule_on_each_cpu(sync_overcommit_as); sysctl_overcommit_memory = new_policy; } else { ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); } return ret; } int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) sysctl_overcommit_ratio = 0; return ret; } /* * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used */ unsigned long vm_commit_limit(void) { unsigned long allowed; if (sysctl_overcommit_kbytes) allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); else allowed = ((totalram_pages() - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100); allowed += total_swap_pages; return allowed; } /* * Make sure vm_committed_as in one cacheline and not cacheline shared with * other variables. It can be updated by several CPUs frequently. */ struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; /* * The global memory commitment made in the system can be a metric * that can be used to drive ballooning decisions when Linux is hosted * as a guest. On Hyper-V, the host implements a policy engine for dynamically * balancing memory across competing virtual machines that are hosted. * Several metrics drive this policy engine including the guest reported * memory commitment. * * The time cost of this is very low for small platforms, and for big * platform like a 2S/36C/72T Skylake server, in worst case where * vm_committed_as's spinlock is under severe contention, the time cost * could be about 30~40 microseconds. */ unsigned long vm_memory_committed(void) { return percpu_counter_sum_positive(&vm_committed_as); } EXPORT_SYMBOL_GPL(vm_memory_committed); /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to * succeed and -ENOMEM implies there is not. * * We currently support three overcommit policies, which are set via the * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting.rst * * Strict overcommit modes added 2002 Feb 26 by Alan Cox. * Additional code 2002 Jul 20 by Robert Love. * * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. * * Note this is a helper function intended to be used by LSMs which * wish to use this logic. */ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { long allowed; vm_acct_memory(pages); /* * Sometimes we want to use more memory than we have */ if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) return 0; if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { if (pages > totalram_pages() + total_swap_pages) goto error; return 0; } allowed = vm_commit_limit(); /* * Reserve some for root */ if (!cap_sys_admin) allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); /* * Don't let a single process grow so big a user can't recover */ if (mm) { long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); allowed -= min_t(long, mm->total_vm / 32, reserve); } if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; error: vm_unacct_memory(pages); return -ENOMEM; } /** * get_cmdline() - copy the cmdline value to a buffer. * @task: the task whose cmdline value to copy. * @buffer: the buffer to copy to. * @buflen: the length of the buffer. Larger cmdline values are truncated * to this length. * * Return: the size of the cmdline field copied. Note that the copy does * not guarantee an ending NULL byte. */ int get_cmdline(struct task_struct *task, char *buffer, int buflen) { int res = 0; unsigned int len; struct mm_struct *mm = get_task_mm(task); unsigned long arg_start, arg_end, env_start, env_end; if (!mm) goto out; if (!mm->arg_end) goto out_mm; /* Shh! No looking before we're done */ spin_lock(&mm->arg_lock); arg_start = mm->arg_start; arg_end = mm->arg_end; env_start = mm->env_start; env_end = mm->env_end; spin_unlock(&mm->arg_lock); len = arg_end - arg_start; if (len > buflen) len = buflen; res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE); /* * If the nul at the end of args has been overwritten, then * assume application is using setproctitle(3). */ if (res > 0 && buffer[res-1] != '\0' && len < buflen) { len = strnlen(buffer, res); if (len < res) { res = len; } else { len = env_end - env_start; if (len > buflen - res) len = buflen - res; res += access_process_vm(task, env_start, buffer+res, len, FOLL_FORCE); res = strnlen(buffer, res); } } out_mm: mmput(mm); out: return res; } int __weak memcmp_pages(struct page *page1, struct page *page2) { char *addr1, *addr2; int ret; addr1 = kmap_atomic(page1); addr2 = kmap_atomic(page2); ret = memcmp(addr1, addr2, PAGE_SIZE); kunmap_atomic(addr2); kunmap_atomic(addr1); return ret; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the TCP protocol. * * Version: @(#)tcp.h 1.0.2 04/28/93 * * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_TCP_H #define _LINUX_TCP_H #include <linux/skbuff.h> #include <linux/win_minmax.h> #include <net/sock.h> #include <net/inet_connection_sock.h> #include <net/inet_timewait_sock.h> #include <uapi/linux/tcp.h> static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb) { return (struct tcphdr *)skb_transport_header(skb); } static inline unsigned int __tcp_hdrlen(const struct tcphdr *th) { return th->doff * 4; } static inline unsigned int tcp_hdrlen(const struct sk_buff *skb) { return __tcp_hdrlen(tcp_hdr(skb)); } static inline struct tcphdr *inner_tcp_hdr(const struct sk_buff *skb) { return (struct tcphdr *)skb_inner_transport_header(skb); } static inline unsigned int inner_tcp_hdrlen(const struct sk_buff *skb) { return inner_tcp_hdr(skb)->doff * 4; } static inline unsigned int tcp_optlen(const struct sk_buff *skb) { return (tcp_hdr(skb)->doff - 5) * 4; } /* TCP Fast Open */ #define TCP_FASTOPEN_COOKIE_MIN 4 /* Min Fast Open Cookie size in bytes */ #define TCP_FASTOPEN_COOKIE_MAX 16 /* Max Fast Open Cookie size in bytes */ #define TCP_FASTOPEN_COOKIE_SIZE 8 /* the size employed by this impl. */ /* TCP Fast Open Cookie as stored in memory */ struct tcp_fastopen_cookie { __le64 val[DIV_ROUND_UP(TCP_FASTOPEN_COOKIE_MAX, sizeof(u64))]; s8 len; bool exp; /* In RFC6994 experimental option format */ }; /* This defines a selective acknowledgement block. */ struct tcp_sack_block_wire { __be32 start_seq; __be32 end_seq; }; struct tcp_sack_block { u32 start_seq; u32 end_seq; }; /*These are used to set the sack_ok field in struct tcp_options_received */ #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */ #define TCP_DSACK_SEEN (1 << 2) /*1 = DSACK was received from peer*/ struct tcp_options_received { /* PAWS/RTTM data */ int ts_recent_stamp;/* Time we stored ts_recent (for aging) */ u32 ts_recent; /* Time stamp to echo next */ u32 rcv_tsval; /* Time stamp value */ u32 rcv_tsecr; /* Time stamp echo reply */ u16 saw_tstamp : 1, /* Saw TIMESTAMP on last packet */ tstamp_ok : 1, /* TIMESTAMP seen on SYN packet */ dsack : 1, /* D-SACK is scheduled */ wscale_ok : 1, /* Wscale seen on SYN packet */ sack_ok : 3, /* SACK seen on SYN packet */ smc_ok : 1, /* SMC seen on SYN packet */ snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ u8 saw_unknown:1, /* Received unknown option */ unused:7; u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ }; static inline void tcp_clear_options(struct tcp_options_received *rx_opt) { rx_opt->tstamp_ok = rx_opt->sack_ok = 0; rx_opt->wscale_ok = rx_opt->snd_wscale = 0; #if IS_ENABLED(CONFIG_SMC) rx_opt->smc_ok = 0; #endif } /* This is the max number of SACKS that we'll generate and process. It's safe * to increase this, although since: * size = TCPOLEN_SACK_BASE_ALIGNED (4) + n * TCPOLEN_SACK_PERBLOCK (8) * only four options will fit in a standard TCP header */ #define TCP_NUM_SACKS 4 struct tcp_request_sock_ops; struct tcp_request_sock { struct inet_request_sock req; const struct tcp_request_sock_ops *af_specific; u64 snt_synack; /* first SYNACK sent time */ bool tfo_listener; bool is_mptcp; #if IS_ENABLED(CONFIG_MPTCP) bool drop_req; #endif u32 txhash; u32 rcv_isn; u32 snt_isn; u32 ts_off; u32 last_oow_ack_time; /* last SYNACK */ u32 rcv_nxt; /* the ack # by SYNACK. For * FastOpen it's the seq# * after data-in-SYN. */ u8 syn_tos; }; static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) { return (struct tcp_request_sock *)req; } struct tcp_sock { /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; u16 tcp_header_len; /* Bytes of tcp header to send */ u16 gso_segs; /* Max number of segs per GSO packet */ /* * Header prediction flags * 0x5?10 << 16 + snd_wnd in net byte order */ __be32 pred_flags; /* * RFC793 variables by their proper names. This means you can * read the code and the spec side by side (and laugh ...) * See RFC793 and RFC1122. The RFC writes these in capitals. */ u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived * sum(delta(rcv_nxt)), or how many bytes * were acked. */ u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn * total number of segments in. */ u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn * total number of data segments in. */ u32 rcv_nxt; /* What we want to receive next */ u32 copied_seq; /* Head of yet unread data */ u32 rcv_wup; /* rcv_nxt on last window update sent */ u32 snd_nxt; /* Next sequence we send */ u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut * The total number of segments sent. */ u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut * total number of data segments sent. */ u64 bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut * total number of data bytes sent. */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked * sum(delta(snd_una)), or how many bytes * were acked. */ u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups * total number of DSACK blocks received */ u32 snd_una; /* First byte we want an ack for */ u32 snd_sml; /* Last byte of the most recently transmitted small packet */ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ u32 last_oow_ack_time; /* timestamp of last out-of-window ACK */ u32 compressed_ack_rcv_nxt; u32 tsoffset; /* timestamp offset */ struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */ u32 snd_wl1; /* Sequence for window update */ u32 snd_wnd; /* The window we expect to receive */ u32 max_window; /* Maximal window ever seen from peer */ u32 mss_cache; /* Cached effective mss, not including SACKS */ u32 window_clamp; /* Maximal window to advertise */ u32 rcv_ssthresh; /* Current window clamp */ /* Information of the most recently (s)acked skb */ struct tcp_rack { u64 mstamp; /* (Re)sent time of the skb */ u32 rtt_us; /* Associated RTT */ u32 end_seq; /* Ending TCP sequence of the skb */ u32 last_delivered; /* tp->delivered at last reo_wnd adj */ u8 reo_wnd_steps; /* Allowed reordering window */ #define TCP_RACK_RECOVERY_THRESH 16 u8 reo_wnd_persist:5, /* No. of recovery since last adj */ dsack_seen:1, /* Whether DSACK seen after last adj */ advanced:1; /* mstamp advanced since last lost marking */ } rack; u16 advmss; /* Advertised MSS */ u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ unused:5; u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ fastopen_client_fail:2; /* reason why fastopen failed */ u8 nonagle : 4,/* Disable Nagle algorithm? */ thin_lto : 1,/* Use linear timeouts for thin streams */ recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ repair : 1, frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ u8 repair_queue; u8 save_syn:2, /* Save headers of SYN packet */ syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_fastopen_ch:1, /* Active TFO re-enabling probe */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ u32 tlp_high_seq; /* snd_nxt at the time of TLP */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ u64 tcp_wstamp_ns; /* departure time for next sent data packet */ u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ /* RTT measurement */ u64 tcp_mstamp; /* most recent packet received/sent */ u32 srtt_us; /* smoothed round trip time << 3 in usecs */ u32 mdev_us; /* medium deviation */ u32 mdev_max_us; /* maximal mdev for the last rtt period */ u32 rttvar_us; /* smoothed mdev_max */ u32 rtt_seq; /* sequence number to update rttvar */ struct minmax rtt_min; u32 packets_out; /* Packets which are "in flight" */ u32 retrans_out; /* Retransmitted packets out */ u32 max_packets_out; /* max packets_out in last window */ u32 max_packets_seq; /* right edge of max_packets_out flight */ u16 urg_data; /* Saved octet of OOB data and control flags */ u8 ecn_flags; /* ECN status bits. */ u8 keepalive_probes; /* num of allowed keep alive probes */ u32 reordering; /* Packet reordering metric. */ u32 reord_seen; /* number of data packet reordering events */ u32 snd_up; /* Urgent pointer */ /* * Options received (usually on last packet, some only on SYN packets). */ struct tcp_options_received rx_opt; /* * Slow start and congestion control (see also Nagle, and Karn & Partridge) */ u32 snd_ssthresh; /* Slow start size threshold */ u32 snd_cwnd; /* Sending congestion window */ u32 snd_cwnd_cnt; /* Linear increase counter */ u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */ u32 snd_cwnd_used; u32 snd_cwnd_stamp; u32 prior_cwnd; /* cwnd right before starting loss recovery */ u32 prr_delivered; /* Number of newly delivered packets to * receiver in Recovery. */ u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 delivered; /* Total data packets delivered incl. rexmits */ u32 delivered_ce; /* Like the above but only ECE marked packets */ u32 lost; /* Total data packets lost incl. rexmits */ u32 app_limited; /* limited until "delivered" reaches this val */ u64 first_tx_mstamp; /* start of window send phase */ u64 delivered_mstamp; /* time we reached "delivered" */ u32 rate_delivered; /* saved rate sample: packets delivered */ u32 rate_interval_us; /* saved rate sample: time elapsed */ u32 rcv_wnd; /* Current receiver window */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */ u32 pushed_seq; /* Last pushed seq, required to talk to windows */ u32 lost_out; /* Lost packets */ u32 sacked_out; /* SACK'd packets */ struct hrtimer pacing_timer; struct hrtimer compressed_ack_timer; /* from STCP, retrans queue hinting */ struct sk_buff* lost_skb_hint; struct sk_buff *retransmit_skb_hint; /* OOO segments go in this rbtree. Socket lock must be held. */ struct rb_root out_of_order_queue; struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */ /* SACKs data, these 2 need to be together (see tcp_options_write) */ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ struct tcp_sack_block recv_sack_cache[4]; struct sk_buff *highest_sack; /* skb just after the highest * skb with SACKed bit set * (validity guaranteed only if * sacked_out > 0) */ int lost_cnt_hint; u32 prior_ssthresh; /* ssthresh saved at recovery start */ u32 high_seq; /* snd_nxt at onset of congestion */ u32 retrans_stamp; /* Timestamp of the last retransmit, * also used in SYN-SENT to remember stamp of * the first SYN. */ u32 undo_marker; /* snd_una upon a new recovery episode. */ int undo_retrans; /* number of undoable retransmissions. */ u64 bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans * Total data bytes retransmitted */ u32 total_retrans; /* Total retransmits for entire connection */ u32 urg_seq; /* Seq of received urgent pointer */ unsigned int keepalive_time; /* time before keep alive takes place */ unsigned int keepalive_intvl; /* time interval between keep alive probes */ int linger2; /* Sock_ops bpf program related variables */ #ifdef CONFIG_BPF u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs * values defined in uapi/linux/tcp.h */ #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG) #else #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0 #endif u16 timeout_rehash; /* Timeout-triggered rehash attempts */ u32 rcv_ooopack; /* Received out-of-order packets, for tcpinfo */ /* Receiver side RTT estimation */ u32 rcv_rtt_last_tsecr; struct { u32 rtt_us; u32 seq; u64 time; } rcv_rtt_est; /* Receiver queue space */ struct { u32 space; u32 seq; u64 time; } rcvq_space; /* TCP-specific MTU probe information. */ struct { u32 probe_seq_start; u32 probe_seq_end; } mtu_probe; u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG * while socket was owned by user. */ #if IS_ENABLED(CONFIG_MPTCP) bool is_mptcp; #endif #if IS_ENABLED(CONFIG_SMC) bool syn_smc; /* SYN includes SMC */ #endif #ifdef CONFIG_TCP_MD5SIG /* TCP AF-Specific parts; only used by MD5 Signature support so far */ const struct tcp_sock_af_ops *af_specific; /* TCP MD5 Signature Option information */ struct tcp_md5sig_info __rcu *md5sig_info; #endif /* TCP fastopen related information */ struct tcp_fastopen_request *fastopen_req; /* fastopen_rsk points to request_sock that resulted in this big * socket. Used to retransmit SYNACKs etc. */ struct request_sock __rcu *fastopen_rsk; struct saved_syn *saved_syn; }; enum tsq_enum { TSQ_THROTTLED, TSQ_QUEUED, TCP_TSQ_DEFERRED, /* tcp_tasklet_func() found socket was owned */ TCP_WRITE_TIMER_DEFERRED, /* tcp_write_timer() found socket was owned */ TCP_DELACK_TIMER_DEFERRED, /* tcp_delack_timer() found socket was owned */ TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call * tcp_v{4|6}_mtu_reduced() */ }; enum tsq_flags { TSQF_THROTTLED = (1UL << TSQ_THROTTLED), TSQF_QUEUED = (1UL << TSQ_QUEUED), TCPF_TSQ_DEFERRED = (1UL << TCP_TSQ_DEFERRED), TCPF_WRITE_TIMER_DEFERRED = (1UL << TCP_WRITE_TIMER_DEFERRED), TCPF_DELACK_TIMER_DEFERRED = (1UL << TCP_DELACK_TIMER_DEFERRED), TCPF_MTU_REDUCED_DEFERRED = (1UL << TCP_MTU_REDUCED_DEFERRED), }; static inline struct tcp_sock *tcp_sk(const struct sock *sk) { return (struct tcp_sock *)sk; } struct tcp_timewait_sock { struct inet_timewait_sock tw_sk; #define tw_rcv_nxt tw_sk.__tw_common.skc_tw_rcv_nxt #define tw_snd_nxt tw_sk.__tw_common.skc_tw_snd_nxt u32 tw_rcv_wnd; u32 tw_ts_offset; u32 tw_ts_recent; /* The time we sent the last out-of-window ACK: */ u32 tw_last_oow_ack_time; int tw_ts_recent_stamp; u32 tw_tx_delay; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *tw_md5_key; #endif }; static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk) { return (struct tcp_timewait_sock *)sk; } static inline bool tcp_passive_fastopen(const struct sock *sk) { return sk->sk_state == TCP_SYN_RECV && rcu_access_pointer(tcp_sk(sk)->fastopen_rsk) != NULL; } static inline void fastopen_queue_tune(struct sock *sk, int backlog) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; int somaxconn = READ_ONCE(sock_net(sk)->core.sysctl_somaxconn); queue->fastopenq.max_qlen = min_t(unsigned int, backlog, somaxconn); } static inline void tcp_move_syn(struct tcp_sock *tp, struct request_sock *req) { tp->saved_syn = req->saved_syn; req->saved_syn = NULL; } static inline void tcp_saved_syn_free(struct tcp_sock *tp) { kfree(tp->saved_syn); tp->saved_syn = NULL; } static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn) { return saved_syn->mac_hdrlen + saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; } struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, const struct sk_buff *orig_skb); static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss) { /* We use READ_ONCE() here because socket might not be locked. * This happens for listeners. */ u16 user_mss = READ_ONCE(tp->rx_opt.user_mss); return (user_mss && user_mss < mss) ? user_mss : mss; } int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); void tcp_sock_set_cork(struct sock *sk, bool on); int tcp_sock_set_keepcnt(struct sock *sk, int val); int tcp_sock_set_keepidle_locked(struct sock *sk, int val); int tcp_sock_set_keepidle(struct sock *sk, int val); int tcp_sock_set_keepintvl(struct sock *sk, int val); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); void tcp_sock_set_user_timeout(struct sock *sk, u32 val); #endif /* _LINUX_TCP_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_UNALIGNED_ACCESS_OK_H #define _LINUX_UNALIGNED_ACCESS_OK_H #include <linux/kernel.h> #include <asm/byteorder.h> static __always_inline u16 get_unaligned_le16(const void *p) { return le16_to_cpup((__le16 *)p); } static __always_inline u32 get_unaligned_le32(const void *p) { return le32_to_cpup((__le32 *)p); } static __always_inline u64 get_unaligned_le64(const void *p) { return le64_to_cpup((__le64 *)p); } static __always_inline u16 get_unaligned_be16(const void *p) { return be16_to_cpup((__be16 *)p); } static __always_inline u32 get_unaligned_be32(const void *p) { return be32_to_cpup((__be32 *)p); } static __always_inline u64 get_unaligned_be64(const void *p) { return be64_to_cpup((__be64 *)p); } static __always_inline void put_unaligned_le16(u16 val, void *p) { *((__le16 *)p) = cpu_to_le16(val); } static __always_inline void put_unaligned_le32(u32 val, void *p) { *((__le32 *)p) = cpu_to_le32(val); } static __always_inline void put_unaligned_le64(u64 val, void *p) { *((__le64 *)p) = cpu_to_le64(val); } static __always_inline void put_unaligned_be16(u16 val, void *p) { *((__be16 *)p) = cpu_to_be16(val); } static __always_inline void put_unaligned_be32(u32 val, void *p) { *((__be32 *)p) = cpu_to_be32(val); } static __always_inline void put_unaligned_be64(u64 val, void *p) { *((__be64 *)p) = cpu_to_be64(val); } #endif /* _LINUX_UNALIGNED_ACCESS_OK_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_UNWIND_H #define _ASM_X86_UNWIND_H #include <linux/sched.h> #include <linux/ftrace.h> #include <asm/ptrace.h> #include <asm/stacktrace.h> #define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip)) #define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET) struct unwind_state { struct stack_info stack_info; unsigned long stack_mask; struct task_struct *task; int graph_idx; bool error; #if defined(CONFIG_UNWINDER_ORC) bool signal, full_regs; unsigned long sp, bp, ip; struct pt_regs *regs, *prev_regs; #elif defined(CONFIG_UNWINDER_FRAME_POINTER) bool got_irq; unsigned long *bp, *orig_sp, ip; /* * If non-NULL: The current frame is incomplete and doesn't contain a * valid BP. When looking for the next frame, use this instead of the * non-existent saved BP. */ unsigned long *next_bp; struct pt_regs *regs; #else unsigned long *sp; #endif }; void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame); bool unwind_next_frame(struct unwind_state *state); unsigned long unwind_get_return_address(struct unwind_state *state); unsigned long *unwind_get_return_address_ptr(struct unwind_state *state); static inline bool unwind_done(struct unwind_state *state) { return state->stack_info.type == STACK_TYPE_UNKNOWN; } static inline bool unwind_error(struct unwind_state *state) { return state->error; } static inline void unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame) { first_frame = first_frame ? : get_stack_pointer(task, regs); __unwind_start(state, task, regs, first_frame); } #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) /* * If 'partial' returns true, only the iret frame registers are valid. */ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, bool *partial) { if (unwind_done(state)) return NULL; if (partial) { #ifdef CONFIG_UNWINDER_ORC *partial = !state->full_regs; #else *partial = false; #endif } return state->regs; } #else static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, bool *partial) { return NULL; } #endif #ifdef CONFIG_UNWINDER_ORC void unwind_init(void); void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size); #else static inline void unwind_init(void) {} static inline void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size) {} #endif /* * This disables KASAN checking when reading a value from another task's stack, * since the other task could be running on another CPU and could have poisoned * the stack in the meantime. */ #define READ_ONCE_TASK_STACK(task, x) \ ({ \ unsigned long val; \ if (task == current) \ val = READ_ONCE(x); \ else \ val = READ_ONCE_NOCHECK(x); \ val; \ }) static inline bool task_on_another_cpu(struct task_struct *task) { #ifdef CONFIG_SMP return task != current && task->on_cpu; #else return false; #endif } #endif /* _ASM_X86_UNWIND_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FS_NOTIFY_H #define _LINUX_FS_NOTIFY_H /* * include/linux/fsnotify.h - generic hooks for filesystem notification, to * reduce in-source duplication from both dnotify and inotify. * * We don't compile any of this away in some complicated menagerie of ifdefs. * Instead, we rely on the code inside to optimize away as needed. * * (C) Copyright 2005 Robert Love */ #include <linux/fsnotify_backend.h> #include <linux/audit.h> #include <linux/slab.h> #include <linux/bug.h> /* * Notify this @dir inode about a change in a child directory entry. * The directory entry may have turned positive or negative or its inode may * have changed (i.e. renamed over). * * Unlike fsnotify_parent(), the event will be reported regardless of the * FS_EVENT_ON_CHILD mask on the parent inode and will not be reported if only * the child is interested and not the parent. */ static inline void fsnotify_name(struct inode *dir, __u32 mask, struct inode *child, const struct qstr *name, u32 cookie) { fsnotify(mask, child, FSNOTIFY_EVENT_INODE, dir, name, NULL, cookie); } static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry, __u32 mask) { fsnotify_name(dir, mask, d_inode(dentry), &dentry->d_name, 0); } static inline void fsnotify_inode(struct inode *inode, __u32 mask) { if (S_ISDIR(inode->i_mode)) mask |= FS_ISDIR; fsnotify(mask, inode, FSNOTIFY_EVENT_INODE, NULL, NULL, inode, 0); } /* Notify this dentry's parent about a child's events. */ static inline int fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { struct inode *inode = d_inode(dentry); if (S_ISDIR(inode->i_mode)) { mask |= FS_ISDIR; /* sb/mount marks are not interested in name of directory */ if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) goto notify_child; } /* disconnected dentry cannot notify parent */ if (IS_ROOT(dentry)) goto notify_child; return __fsnotify_parent(dentry, mask, data, data_type); notify_child: return fsnotify(mask, data, data_type, NULL, NULL, inode, 0); } /* * Simple wrappers to consolidate calls to fsnotify_parent() when an event * is on a file/dentry. */ static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask) { fsnotify_parent(dentry, mask, d_inode(dentry), FSNOTIFY_EVENT_INODE); } static inline int fsnotify_file(struct file *file, __u32 mask) { const struct path *path = &file->f_path; if (file->f_mode & FMODE_NONOTIFY) return 0; return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); } /* Simple call site for access decisions */ static inline int fsnotify_perm(struct file *file, int mask) { int ret; __u32 fsnotify_mask = 0; if (!(mask & (MAY_READ | MAY_OPEN))) return 0; if (mask & MAY_OPEN) { fsnotify_mask = FS_OPEN_PERM; if (file->f_flags & __FMODE_EXEC) { ret = fsnotify_file(file, FS_OPEN_EXEC_PERM); if (ret) return ret; } } else if (mask & MAY_READ) { fsnotify_mask = FS_ACCESS_PERM; } return fsnotify_file(file, fsnotify_mask); } /* * fsnotify_link_count - inode's link count changed */ static inline void fsnotify_link_count(struct inode *inode) { fsnotify_inode(inode, FS_ATTRIB); } /* * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir */ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, const struct qstr *old_name, int isdir, struct inode *target, struct dentry *moved) { struct inode *source = moved->d_inode; u32 fs_cookie = fsnotify_get_cookie(); __u32 old_dir_mask = FS_MOVED_FROM; __u32 new_dir_mask = FS_MOVED_TO; const struct qstr *new_name = &moved->d_name; if (old_dir == new_dir) old_dir_mask |= FS_DN_RENAME; if (isdir) { old_dir_mask |= FS_ISDIR; new_dir_mask |= FS_ISDIR; } fsnotify_name(old_dir, old_dir_mask, source, old_name, fs_cookie); fsnotify_name(new_dir, new_dir_mask, source, new_name, fs_cookie); if (target) fsnotify_link_count(target); fsnotify_inode(source, FS_MOVE_SELF); audit_inode_child(new_dir, moved, AUDIT_TYPE_CHILD_CREATE); } /* * fsnotify_inode_delete - and inode is being evicted from cache, clean up is needed */ static inline void fsnotify_inode_delete(struct inode *inode) { __fsnotify_inode_delete(inode); } /* * fsnotify_vfsmount_delete - a vfsmount is being destroyed, clean up is needed */ static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt) { __fsnotify_vfsmount_delete(mnt); } /* * fsnotify_inoderemove - an inode is going away */ static inline void fsnotify_inoderemove(struct inode *inode) { fsnotify_inode(inode, FS_DELETE_SELF); __fsnotify_inode_delete(inode); } /* * fsnotify_create - 'name' was linked in */ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry) { audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE); fsnotify_dirent(inode, dentry, FS_CREATE); } /* * fsnotify_link - new hardlink in 'inode' directory * Note: We have to pass also the linked inode ptr as some filesystems leave * new_dentry->d_inode NULL and instantiate inode pointer later */ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct dentry *new_dentry) { fsnotify_link_count(inode); audit_inode_child(dir, new_dentry, AUDIT_TYPE_CHILD_CREATE); fsnotify_name(dir, FS_CREATE, inode, &new_dentry->d_name, 0); } /* * fsnotify_unlink - 'name' was unlinked * * Caller must make sure that dentry->d_name is stable. */ static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry) { /* Expected to be called before d_delete() */ WARN_ON_ONCE(d_is_negative(dentry)); fsnotify_dirent(dir, dentry, FS_DELETE); } /* * fsnotify_mkdir - directory 'name' was created */ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry) { audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE); fsnotify_dirent(inode, dentry, FS_CREATE | FS_ISDIR); } /* * fsnotify_rmdir - directory 'name' was removed * * Caller must make sure that dentry->d_name is stable. */ static inline void fsnotify_rmdir(struct inode *dir, struct dentry *dentry) { /* Expected to be called before d_delete() */ WARN_ON_ONCE(d_is_negative(dentry)); fsnotify_dirent(dir, dentry, FS_DELETE | FS_ISDIR); } /* * fsnotify_access - file was read */ static inline void fsnotify_access(struct file *file) { fsnotify_file(file, FS_ACCESS); } /* * fsnotify_modify - file was modified */ static inline void fsnotify_modify(struct file *file) { fsnotify_file(file, FS_MODIFY); } /* * fsnotify_open - file was opened */ static inline void fsnotify_open(struct file *file) { __u32 mask = FS_OPEN; if (file->f_flags & __FMODE_EXEC) mask |= FS_OPEN_EXEC; fsnotify_file(file, mask); } /* * fsnotify_close - file was closed */ static inline void fsnotify_close(struct file *file) { __u32 mask = (file->f_mode & FMODE_WRITE) ? FS_CLOSE_WRITE : FS_CLOSE_NOWRITE; fsnotify_file(file, mask); } /* * fsnotify_xattr - extended attributes were changed */ static inline void fsnotify_xattr(struct dentry *dentry) { fsnotify_dentry(dentry, FS_ATTRIB); } /* * fsnotify_change - notify_change event. file was modified and/or metadata * was changed. */ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid) { __u32 mask = 0; if (ia_valid & ATTR_UID) mask |= FS_ATTRIB; if (ia_valid & ATTR_GID) mask |= FS_ATTRIB; if (ia_valid & ATTR_SIZE) mask |= FS_MODIFY; /* both times implies a utime(s) call */ if ((ia_valid & (ATTR_ATIME | ATTR_MTIME)) == (ATTR_ATIME | ATTR_MTIME)) mask |= FS_ATTRIB; else if (ia_valid & ATTR_ATIME) mask |= FS_ACCESS; else if (ia_valid & ATTR_MTIME) mask |= FS_MODIFY; if (ia_valid & ATTR_MODE) mask |= FS_ATTRIB; if (mask) fsnotify_dentry(dentry, mask); } #endif /* _LINUX_FS_NOTIFY_H */
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 /* SPDX-License-Identifier: GPL-2.0 */ /* * Variant of atomic_t specialized for reference counts. * * The interface matches the atomic_t interface (to aid in porting) but only * provides the few functions one should use for reference counting. * * Saturation semantics * ==================== * * refcount_t differs from atomic_t in that the counter saturates at * REFCOUNT_SATURATED and will not move once there. This avoids wrapping the * counter and causing 'spurious' use-after-free issues. In order to avoid the * cost associated with introducing cmpxchg() loops into all of the saturating * operations, we temporarily allow the counter to take on an unchecked value * and then explicitly set it to REFCOUNT_SATURATED on detecting that underflow * or overflow has occurred. Although this is racy when multiple threads * access the refcount concurrently, by placing REFCOUNT_SATURATED roughly * equidistant from 0 and INT_MAX we minimise the scope for error: * * INT_MAX REFCOUNT_SATURATED UINT_MAX * 0 (0x7fff_ffff) (0xc000_0000) (0xffff_ffff) * +--------------------------------+----------------+----------------+ * <---------- bad value! ----------> * * (in a signed view of the world, the "bad value" range corresponds to * a negative counter value). * * As an example, consider a refcount_inc() operation that causes the counter * to overflow: * * int old = atomic_fetch_add_relaxed(r); * // old is INT_MAX, refcount now INT_MIN (0x8000_0000) * if (old < 0) * atomic_set(r, REFCOUNT_SATURATED); * * If another thread also performs a refcount_inc() operation between the two * atomic operations, then the count will continue to edge closer to 0. If it * reaches a value of 1 before /any/ of the threads reset it to the saturated * value, then a concurrent refcount_dec_and_test() may erroneously free the * underlying object. * Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently * 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK). * With the current PID limit, if no batched refcounting operations are used and * the attacker can't repeatedly trigger kernel oopses in the middle of refcount * operations, this makes it impossible for a saturated refcount to leave the * saturation range, even if it is possible for multiple uses of the same * refcount to nest in the context of a single task: * * (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT = * 0x40000000 / 0x400000 = 0x100 = 256 * * If hundreds of references are added/removed with a single refcounting * operation, it may potentially be possible to leave the saturation range; but * given the precise timing details involved with the round-robin scheduling of * each thread manipulating the refcount and the need to hit the race multiple * times in succession, there doesn't appear to be a practical avenue of attack * even if using refcount_add() operations with larger increments. * * Memory ordering * =============== * * Memory ordering rules are slightly relaxed wrt regular atomic_t functions * and provide only what is strictly required for refcounts. * * The increments are fully relaxed; these will not provide ordering. The * rationale is that whatever is used to obtain the object we're increasing the * reference count on will provide the ordering. For locked data structures, * its the lock acquire, for RCU/lockless data structures its the dependent * load. * * Do note that inc_not_zero() provides a control dependency which will order * future stores against the inc, this ensures we'll never modify the object * if we did not in fact acquire a reference. * * The decrements will provide release order, such that all the prior loads and * stores will be issued before, it also provides a control dependency, which * will order us against the subsequent free(). * * The control dependency is against the load of the cmpxchg (ll/sc) that * succeeded. This means the stores aren't fully ordered, but this is fine * because the 1->0 transition indicates no concurrency. * * Note that the allocator is responsible for ordering things between free() * and alloc(). * * The decrements dec_and_test() and sub_and_test() also provide acquire * ordering on success. * */ #ifndef _LINUX_REFCOUNT_H #define _LINUX_REFCOUNT_H #include <linux/atomic.h> #include <linux/bug.h> #include <linux/compiler.h> #include <linux/limits.h> #include <linux/spinlock_types.h> struct mutex; /** * struct refcount_t - variant of atomic_t specialized for reference counts * @refs: atomic_t counter field * * The counter saturates at REFCOUNT_SATURATED and will not move once * there. This avoids wrapping the counter and causing 'spurious' * use-after-free bugs. */ typedef struct refcount_struct { atomic_t refs; } refcount_t; #define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), } #define REFCOUNT_MAX INT_MAX #define REFCOUNT_SATURATED (INT_MIN / 2) enum refcount_saturation_type { REFCOUNT_ADD_NOT_ZERO_OVF, REFCOUNT_ADD_OVF, REFCOUNT_ADD_UAF, REFCOUNT_SUB_UAF, REFCOUNT_DEC_LEAK, }; void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t); /** * refcount_set - set a refcount's value * @r: the refcount * @n: value to which the refcount will be set */ static inline void refcount_set(refcount_t *r, int n) { atomic_set(&r->refs, n); } /** * refcount_read - get a refcount's value * @r: the refcount * * Return: the refcount's value */ static inline unsigned int refcount_read(const refcount_t *r) { return atomic_read(&r->refs); } static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp) { int old = refcount_read(r); do { if (!old) break; } while (!atomic_try_cmpxchg_relaxed(&r->refs, &old, old + i)); if (oldp) *oldp = old; if (unlikely(old < 0 || old + i < 0)) refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF); return old; } /** * refcount_add_not_zero - add a value to a refcount unless it is 0 * @i: the value to add to the refcount * @r: the refcount * * Will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See the comment on top. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_inc(), or one of its variants, should instead be used to * increment a reference count. * * Return: false if the passed refcount is 0, true otherwise */ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) { return __refcount_add_not_zero(i, r, NULL); } static inline void __refcount_add(int i, refcount_t *r, int *oldp) { int old = atomic_fetch_add_relaxed(i, &r->refs); if (oldp) *oldp = old; if (unlikely(!old)) refcount_warn_saturate(r, REFCOUNT_ADD_UAF); else if (unlikely(old < 0 || old + i < 0)) refcount_warn_saturate(r, REFCOUNT_ADD_OVF); } /** * refcount_add - add a value to a refcount * @i: the value to add to the refcount * @r: the refcount * * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See the comment on top. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_inc(), or one of its variants, should instead be used to * increment a reference count. */ static inline void refcount_add(int i, refcount_t *r) { __refcount_add(i, r, NULL); } static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp) { return __refcount_add_not_zero(1, r, oldp); } /** * refcount_inc_not_zero - increment a refcount unless it is 0 * @r: the refcount to increment * * Similar to atomic_inc_not_zero(), but will saturate at REFCOUNT_SATURATED * and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See the comment on top. * * Return: true if the increment was successful, false otherwise */ static inline __must_check bool refcount_inc_not_zero(refcount_t *r) { return __refcount_inc_not_zero(r, NULL); } static inline void __refcount_inc(refcount_t *r, int *oldp) { __refcount_add(1, r, oldp); } /** * refcount_inc - increment a refcount * @r: the refcount to increment * * Similar to atomic_inc(), but will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller already has a * reference on the object. * * Will WARN if the refcount is 0, as this represents a possible use-after-free * condition. */ static inline void refcount_inc(refcount_t *r) { __refcount_inc(r, NULL); } static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp) { int old = atomic_fetch_sub_release(i, &r->refs); if (oldp) *oldp = old; if (old == i) { smp_acquire__after_ctrl_dep(); return true; } if (unlikely(old < 0 || old - i < 0)) refcount_warn_saturate(r, REFCOUNT_SUB_UAF); return false; } /** * refcount_sub_and_test - subtract from a refcount and test if it is 0 * @i: amount to subtract from the refcount * @r: the refcount * * Similar to atomic_dec_and_test(), but it will WARN, return false and * ultimately leak on underflow and will fail to decrement when saturated * at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_dec(), or one of its variants, should instead be used to * decrement a reference count. * * Return: true if the resulting refcount is 0, false otherwise */ static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) { return __refcount_sub_and_test(i, r, NULL); } static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp) { return __refcount_sub_and_test(1, r, oldp); } /** * refcount_dec_and_test - decrement a refcount and test if it is 0 * @r: the refcount * * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to * decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Return: true if the resulting refcount is 0, false otherwise */ static inline __must_check bool refcount_dec_and_test(refcount_t *r) { return __refcount_dec_and_test(r, NULL); } static inline void __refcount_dec(refcount_t *r, int *oldp) { int old = atomic_fetch_sub_release(1, &r->refs); if (oldp) *oldp = old; if (unlikely(old <= 1)) refcount_warn_saturate(r, REFCOUNT_DEC_LEAK); } /** * refcount_dec - decrement a refcount * @r: the refcount * * Similar to atomic_dec(), it will WARN on underflow and fail to decrement * when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before. */ static inline void refcount_dec(refcount_t *r) { __refcount_dec(r, NULL); } extern __must_check bool refcount_dec_if_one(refcount_t *r); extern __must_check bool refcount_dec_not_one(refcount_t *r); extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock); extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock); extern __must_check bool refcount_dec_and_lock_irqsave(refcount_t *r, spinlock_t *lock, unsigned long *flags); #endif /* _LINUX_REFCOUNT_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_CPUMASK_H #define __LINUX_CPUMASK_H /* * Cpumasks provide a bitmap suitable for representing the * set of CPU's in a system, one bit position per CPU number. In general, * only nr_cpu_ids (<= NR_CPUS) bits are valid. */ #include <linux/kernel.h> #include <linux/threads.h> #include <linux/bitmap.h> #include <linux/atomic.h> #include <linux/bug.h> /* Don't assign or return these: may not be this big! */ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; /** * cpumask_bits - get the bits in a cpumask * @maskp: the struct cpumask * * * You should only assume nr_cpu_ids bits of this mask are valid. This is * a macro so it's const-correct. */ #define cpumask_bits(maskp) ((maskp)->bits) /** * cpumask_pr_args - printf args to output a cpumask * @maskp: cpumask to be printed * * Can be used to provide arguments for '%*pb[l]' when printing a cpumask. */ #define cpumask_pr_args(maskp) nr_cpu_ids, cpumask_bits(maskp) #if NR_CPUS == 1 #define nr_cpu_ids 1U #else extern unsigned int nr_cpu_ids; #endif #ifdef CONFIG_CPUMASK_OFFSTACK /* Assuming NR_CPUS is huge, a runtime limit is more efficient. Also, * not all bits may be allocated. */ #define nr_cpumask_bits nr_cpu_ids #else #define nr_cpumask_bits ((unsigned int)NR_CPUS) #endif /* * The following particular system cpumasks and operations manage * possible, present, active and online cpus. * * cpu_possible_mask- has bit 'cpu' set iff cpu is populatable * cpu_present_mask - has bit 'cpu' set iff cpu is populated * cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler * cpu_active_mask - has bit 'cpu' set iff cpu available to migration * * If !CONFIG_HOTPLUG_CPU, present == possible, and active == online. * * The cpu_possible_mask is fixed at boot time, as the set of CPU id's * that it is possible might ever be plugged in at anytime during the * life of that system boot. The cpu_present_mask is dynamic(*), * representing which CPUs are currently plugged in. And * cpu_online_mask is the dynamic subset of cpu_present_mask, * indicating those CPUs available for scheduling. * * If HOTPLUG is enabled, then cpu_possible_mask is forced to have * all NR_CPUS bits set, otherwise it is just the set of CPUs that * ACPI reports present at boot. * * If HOTPLUG is enabled, then cpu_present_mask varies dynamically, * depending on what ACPI reports as currently plugged in, otherwise * cpu_present_mask is just a copy of cpu_possible_mask. * * (*) Well, cpu_present_mask is dynamic in the hotplug case. If not * hotplug, it's a copy of cpu_possible_mask, hence fixed at boot. * * Subtleties: * 1) UP arch's (NR_CPUS == 1, CONFIG_SMP not defined) hardcode * assumption that their single CPU is online. The UP * cpu_{online,possible,present}_masks are placebos. Changing them * will have no useful affect on the following num_*_cpus() * and cpu_*() macros in the UP case. This ugliness is a UP * optimization - don't waste any instructions or memory references * asking if you're online or how many CPUs there are if there is * only one CPU. */ extern struct cpumask __cpu_possible_mask; extern struct cpumask __cpu_online_mask; extern struct cpumask __cpu_present_mask; extern struct cpumask __cpu_active_mask; #define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask) #define cpu_online_mask ((const struct cpumask *)&__cpu_online_mask) #define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask) #define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask) extern atomic_t __num_online_cpus; #if NR_CPUS > 1 /** * num_online_cpus() - Read the number of online CPUs * * Despite the fact that __num_online_cpus is of type atomic_t, this * interface gives only a momentary snapshot and is not protected against * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held * region. */ static inline unsigned int num_online_cpus(void) { return atomic_read(&__num_online_cpus); } #define num_possible_cpus() cpumask_weight(cpu_possible_mask) #define num_present_cpus() cpumask_weight(cpu_present_mask) #define num_active_cpus() cpumask_weight(cpu_active_mask) #define cpu_online(cpu) cpumask_test_cpu((cpu), cpu_online_mask) #define cpu_possible(cpu) cpumask_test_cpu((cpu), cpu_possible_mask) #define cpu_present(cpu) cpumask_test_cpu((cpu), cpu_present_mask) #define cpu_active(cpu) cpumask_test_cpu((cpu), cpu_active_mask) #else #define num_online_cpus() 1U #define num_possible_cpus() 1U #define num_present_cpus() 1U #define num_active_cpus() 1U #define cpu_online(cpu) ((cpu) == 0) #define cpu_possible(cpu) ((cpu) == 0) #define cpu_present(cpu) ((cpu) == 0) #define cpu_active(cpu) ((cpu) == 0) #endif extern cpumask_t cpus_booted_once_mask; static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits) { #ifdef CONFIG_DEBUG_PER_CPU_MAPS WARN_ON_ONCE(cpu >= bits); #endif /* CONFIG_DEBUG_PER_CPU_MAPS */ } /* verify cpu argument to cpumask_* operators */ static inline unsigned int cpumask_check(unsigned int cpu) { cpu_max_bits_warn(cpu, nr_cpumask_bits); return cpu; } #if NR_CPUS == 1 /* Uniprocessor. Assume all masks are "1". */ static inline unsigned int cpumask_first(const struct cpumask *srcp) { return 0; } static inline unsigned int cpumask_last(const struct cpumask *srcp) { return 0; } /* Valid inputs for n are -1 and 0. */ static inline unsigned int cpumask_next(int n, const struct cpumask *srcp) { return n+1; } static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) { return n+1; } static inline unsigned int cpumask_next_and(int n, const struct cpumask *srcp, const struct cpumask *andp) { return n+1; } static inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap) { /* cpu0 unless stop condition, wrap and at cpu0, then nr_cpumask_bits */ return (wrap && n == 0); } /* cpu must be a valid cpu, ie 0, so there's no other choice. */ static inline unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) { return 1; } static inline unsigned int cpumask_local_spread(unsigned int i, int node) { return 0; } static inline int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p) { return cpumask_next_and(-1, src1p, src2p); } #define for_each_cpu(cpu, mask) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) #define for_each_cpu_not(cpu, mask) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) #define for_each_cpu_wrap(cpu, mask, start) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start)) #define for_each_cpu_and(cpu, mask1, mask2) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask1, (void)mask2) #else /** * cpumask_first - get the first cpu in a cpumask * @srcp: the cpumask pointer * * Returns >= nr_cpu_ids if no cpus set. */ static inline unsigned int cpumask_first(const struct cpumask *srcp) { return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_last - get the last CPU in a cpumask * @srcp: - the cpumask pointer * * Returns >= nr_cpumask_bits if no CPUs set. */ static inline unsigned int cpumask_last(const struct cpumask *srcp) { return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits); } unsigned int cpumask_next(int n, const struct cpumask *srcp); /** * cpumask_next_zero - get the next unset cpu in a cpumask * @n: the cpu prior to the place to search (ie. return will be > @n) * @srcp: the cpumask pointer * * Returns >= nr_cpu_ids if no further cpus unset. */ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1); } int cpumask_next_and(int n, const struct cpumask *, const struct cpumask *); int cpumask_any_but(const struct cpumask *mask, unsigned int cpu); unsigned int cpumask_local_spread(unsigned int i, int node); int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p); /** * for_each_cpu - iterate over every cpu in a mask * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu(cpu, mask) \ for ((cpu) = -1; \ (cpu) = cpumask_next((cpu), (mask)), \ (cpu) < nr_cpu_ids;) /** * for_each_cpu_not - iterate over every cpu in a complemented mask * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_not(cpu, mask) \ for ((cpu) = -1; \ (cpu) = cpumask_next_zero((cpu), (mask)), \ (cpu) < nr_cpu_ids;) extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap); /** * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask poiter * @start: the start location * * The implementation does not assume any bit in @mask is set (including @start). * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_wrap(cpu, mask, start) \ for ((cpu) = cpumask_next_wrap((start)-1, (mask), (start), false); \ (cpu) < nr_cpumask_bits; \ (cpu) = cpumask_next_wrap((cpu), (mask), (start), true)) /** * for_each_cpu_and - iterate over every cpu in both masks * @cpu: the (optionally unsigned) integer iterator * @mask1: the first cpumask pointer * @mask2: the second cpumask pointer * * This saves a temporary CPU mask in many places. It is equivalent to: * struct cpumask tmp; * cpumask_and(&tmp, &mask1, &mask2); * for_each_cpu(cpu, &tmp) * ... * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_and(cpu, mask1, mask2) \ for ((cpu) = -1; \ (cpu) = cpumask_next_and((cpu), (mask1), (mask2)), \ (cpu) < nr_cpu_ids;) #endif /* SMP */ #define CPU_BITS_NONE \ { \ [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ } #define CPU_BITS_CPU0 \ { \ [0] = 1UL \ } /** * cpumask_set_cpu - set a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @dstp: the cpumask pointer */ static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) { set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } static inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) { __set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } /** * cpumask_clear_cpu - clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @dstp: the cpumask pointer */ static inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp) { clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); } static inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp) { __clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); } /** * cpumask_test_cpu - test for a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * Returns 1 if @cpu is set in @cpumask, else returns 0 */ static inline int cpumask_test_cpu(int cpu, const struct cpumask *cpumask) { return test_bit(cpumask_check(cpu), cpumask_bits((cpumask))); } /** * cpumask_test_and_set_cpu - atomically test and set a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * Returns 1 if @cpu is set in old bitmap of @cpumask, else returns 0 * * test_and_set_bit wrapper for cpumasks. */ static inline int cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask) { return test_and_set_bit(cpumask_check(cpu), cpumask_bits(cpumask)); } /** * cpumask_test_and_clear_cpu - atomically test and clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * Returns 1 if @cpu is set in old bitmap of @cpumask, else returns 0 * * test_and_clear_bit wrapper for cpumasks. */ static inline int cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask) { return test_and_clear_bit(cpumask_check(cpu), cpumask_bits(cpumask)); } /** * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ static inline void cpumask_setall(struct cpumask *dstp) { bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ static inline void cpumask_clear(struct cpumask *dstp) { bitmap_zero(cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_and - *dstp = *src1p & *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input * * If *@dstp is empty, returns 0, else returns 1 */ static inline int cpumask_and(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_and(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_or - *dstp = *src1p | *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input */ static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_xor - *dstp = *src1p ^ *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input */ static inline void cpumask_xor(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { bitmap_xor(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_andnot - *dstp = *src1p & ~*src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input * * If *@dstp is empty, returns 0, else returns 1 */ static inline int cpumask_andnot(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_andnot(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_complement - *dstp = ~*srcp * @dstp: the cpumask result * @srcp: the input to invert */ static inline void cpumask_complement(struct cpumask *dstp, const struct cpumask *srcp) { bitmap_complement(cpumask_bits(dstp), cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask