1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef IOCONTEXT_H #define IOCONTEXT_H #include <linux/radix-tree.h> #include <linux/rcupdate.h> #include <linux/workqueue.h> enum { ICQ_EXITED = 1 << 2, ICQ_DESTROYED = 1 << 3, }; /* * An io_cq (icq) is association between an io_context (ioc) and a * request_queue (q). This is used by elevators which need to track * information per ioc - q pair. * * Elevator can request use of icq by setting elevator_type->icq_size and * ->icq_align. Both size and align must be larger than that of struct * io_cq and elevator can use the tail area for private information. The * recommended way to do this is defining a struct which contains io_cq as * the first member followed by private members and using its size and * align. For example, * * struct snail_io_cq { * struct io_cq icq; * int poke_snail; * int feed_snail; * }; * * struct elevator_type snail_elv_type { * .ops = { ... }, * .icq_size = sizeof(struct snail_io_cq), * .icq_align = __alignof__(struct snail_io_cq), * ... * }; * * If icq_size is set, block core will manage icq's. All requests will * have its ->elv.icq field set before elevator_ops->elevator_set_req_fn() * is called and be holding a reference to the associated io_context. * * Whenever a new icq is created, elevator_ops->elevator_init_icq_fn() is * called and, on destruction, ->elevator_exit_icq_fn(). Both functions * are called with both the associated io_context and queue locks held. * * Elevator is allowed to lookup icq using ioc_lookup_icq() while holding * queue lock but the returned icq is valid only until the queue lock is * released. Elevators can not and should not try to create or destroy * icq's. * * As icq's are linked from both ioc and q, the locking rules are a bit * complex. * * - ioc lock nests inside q lock. * * - ioc->icq_list and icq->ioc_node are protected by ioc lock. * q->icq_list and icq->q_node by q lock. * * - ioc->icq_tree and ioc->icq_hint are protected by ioc lock, while icq * itself is protected by q lock. However, both the indexes and icq * itself are also RCU managed and lookup can be performed holding only * the q lock. * * - icq's are not reference counted. They are destroyed when either the * ioc or q goes away. Each request with icq set holds an extra * reference to ioc to ensure it stays until the request is completed. * * - Linking and unlinking icq's are performed while holding both ioc and q * locks. Due to the lock ordering, q exit is simple but ioc exit * requires reverse-order double lock dance. */ struct io_cq { struct request_queue *q; struct io_context *ioc; /* * q_node and ioc_node link io_cq through icq_list of q and ioc * respectively. Both fields are unused once ioc_exit_icq() is * called and shared with __rcu_icq_cache and __rcu_head which are * used for RCU free of io_cq. */ union { struct list_head q_node; struct kmem_cache *__rcu_icq_cache; }; union { struct hlist_node ioc_node; struct rcu_head __rcu_head; }; unsigned int flags; }; /* * I/O subsystem state of the associated processes. It is refcounted * and kmalloc'ed. These could be shared between processes. */ struct io_context { atomic_long_t refcount; atomic_t active_ref; atomic_t nr_tasks; /* all the fields below are protected by this lock */ spinlock_t lock; unsigned short ioprio; struct radix_tree_root icq_tree; struct io_cq __rcu *icq_hint; struct hlist_head icq_list; struct work_struct release_work; }; /** * get_io_context_active - get active reference on ioc * @ioc: ioc of interest * * Only iocs with active reference can issue new IOs. This function * acquires an active reference on @ioc. The caller must already have an * active reference on @ioc. */ static inline void get_io_context_active(struct io_context *ioc) { WARN_ON_ONCE(atomic_long_read(&ioc->refcount) <= 0); WARN_ON_ONCE(atomic_read(&ioc->active_ref) <= 0); atomic_long_inc(&ioc->refcount); atomic_inc(&ioc->active_ref); } static inline void ioc_task_link(struct io_context *ioc) { get_io_context_active(ioc); WARN_ON_ONCE(atomic_read(&ioc->nr_tasks) <= 0); atomic_inc(&ioc->nr_tasks); } struct task_struct; #ifdef CONFIG_BLOCK void put_io_context(struct io_context *ioc); void put_io_context_active(struct io_context *ioc); void exit_io_context(struct task_struct *task); struct io_context *get_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node); #else struct io_context; static inline void put_io_context(struct io_context *ioc) { } static inline void exit_io_context(struct task_struct *task) { } #endif #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM libata #if !defined(_TRACE_LIBATA_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_LIBATA_H #include <linux/ata.h> #include <linux/libata.h> #include <linux/tracepoint.h> #include <linux/trace_seq.h> #define ata_opcode_name(opcode) { opcode, #opcode } #define show_opcode_name(val) \ __print_symbolic(val, \ ata_opcode_name(ATA_CMD_DEV_RESET), \ ata_opcode_name(ATA_CMD_CHK_POWER), \ ata_opcode_name(ATA_CMD_STANDBY), \ ata_opcode_name(ATA_CMD_IDLE), \ ata_opcode_name(ATA_CMD_EDD), \ ata_opcode_name(ATA_CMD_DOWNLOAD_MICRO), \ ata_opcode_name(ATA_CMD_DOWNLOAD_MICRO_DMA), \ ata_opcode_name(ATA_CMD_NOP), \ ata_opcode_name(ATA_CMD_FLUSH), \ ata_opcode_name(ATA_CMD_FLUSH_EXT), \ ata_opcode_name(ATA_CMD_ID_ATA), \ ata_opcode_name(ATA_CMD_ID_ATAPI), \ ata_opcode_name(ATA_CMD_SERVICE), \ ata_opcode_name(ATA_CMD_READ), \ ata_opcode_name(ATA_CMD_READ_EXT), \ ata_opcode_name(ATA_CMD_READ_QUEUED), \ ata_opcode_name(ATA_CMD_READ_STREAM_EXT), \ ata_opcode_name(ATA_CMD_READ_STREAM_DMA_EXT), \ ata_opcode_name(ATA_CMD_WRITE), \ ata_opcode_name(ATA_CMD_WRITE_EXT), \ ata_opcode_name(ATA_CMD_WRITE_QUEUED), \ ata_opcode_name(ATA_CMD_WRITE_STREAM_EXT), \ ata_opcode_name(ATA_CMD_WRITE_STREAM_DMA_EXT), \ ata_opcode_name(ATA_CMD_WRITE_FUA_EXT), \ ata_opcode_name(ATA_CMD_WRITE_QUEUED_FUA_EXT), \ ata_opcode_name(ATA_CMD_FPDMA_READ), \ ata_opcode_name(ATA_CMD_FPDMA_WRITE), \ ata_opcode_name(ATA_CMD_NCQ_NON_DATA), \ ata_opcode_name(ATA_CMD_FPDMA_SEND), \ ata_opcode_name(ATA_CMD_FPDMA_RECV), \ ata_opcode_name(ATA_CMD_PIO_READ), \ ata_opcode_name(ATA_CMD_PIO_READ_EXT), \ ata_opcode_name(ATA_CMD_PIO_WRITE), \ ata_opcode_name(ATA_CMD_PIO_WRITE_EXT), \ ata_opcode_name(ATA_CMD_READ_MULTI), \ ata_opcode_name(ATA_CMD_READ_MULTI_EXT), \ ata_opcode_name(ATA_CMD_WRITE_MULTI), \ ata_opcode_name(ATA_CMD_WRITE_MULTI_EXT), \ ata_opcode_name(ATA_CMD_WRITE_MULTI_FUA_EXT), \ ata_opcode_name(ATA_CMD_SET_FEATURES), \ ata_opcode_name(ATA_CMD_SET_MULTI), \ ata_opcode_name(ATA_CMD_PACKET), \ ata_opcode_name(ATA_CMD_VERIFY), \ ata_opcode_name(ATA_CMD_VERIFY_EXT), \ ata_opcode_name(ATA_CMD_WRITE_UNCORR_EXT), \ ata_opcode_name(ATA_CMD_STANDBYNOW1), \ ata_opcode_name(ATA_CMD_IDLEIMMEDIATE), \ ata_opcode_name(ATA_CMD_SLEEP), \ ata_opcode_name(ATA_CMD_INIT_DEV_PARAMS), \ ata_opcode_name(ATA_CMD_READ_NATIVE_MAX), \ ata_opcode_name(ATA_CMD_READ_NATIVE_MAX_EXT), \ ata_opcode_name(ATA_CMD_SET_MAX), \ ata_opcode_name(ATA_CMD_SET_MAX_EXT), \ ata_opcode_name(ATA_CMD_READ_LOG_EXT), \ ata_opcode_name(ATA_CMD_WRITE_LOG_EXT), \ ata_opcode_name(ATA_CMD_READ_LOG_DMA_EXT), \ ata_opcode_name(ATA_CMD_WRITE_LOG_DMA_EXT), \ ata_opcode_name(ATA_CMD_TRUSTED_NONDATA), \ ata_opcode_name(ATA_CMD_TRUSTED_RCV), \ ata_opcode_name(ATA_CMD_TRUSTED_RCV_DMA), \ ata_opcode_name(ATA_CMD_TRUSTED_SND), \ ata_opcode_name(ATA_CMD_TRUSTED_SND_DMA), \ ata_opcode_name(ATA_CMD_PMP_READ), \ ata_opcode_name(ATA_CMD_PMP_READ_DMA), \ ata_opcode_name(ATA_CMD_PMP_WRITE), \ ata_opcode_name(ATA_CMD_PMP_WRITE_DMA), \ ata_opcode_name(ATA_CMD_CONF_OVERLAY), \ ata_opcode_name(ATA_CMD_SEC_SET_PASS), \ ata_opcode_name(ATA_CMD_SEC_UNLOCK), \ ata_opcode_name(ATA_CMD_SEC_ERASE_PREP), \ ata_opcode_name(ATA_CMD_SEC_ERASE_UNIT), \ ata_opcode_name(ATA_CMD_SEC_FREEZE_LOCK), \ ata_opcode_name(ATA_CMD_SEC_DISABLE_PASS), \ ata_opcode_name(ATA_CMD_CONFIG_STREAM), \ ata_opcode_name(ATA_CMD_SMART), \ ata_opcode_name(ATA_CMD_MEDIA_LOCK), \ ata_opcode_name(ATA_CMD_MEDIA_UNLOCK), \ ata_opcode_name(ATA_CMD_DSM), \ ata_opcode_name(ATA_CMD_CHK_MED_CRD_TYP), \ ata_opcode_name(ATA_CMD_CFA_REQ_EXT_ERR), \ ata_opcode_name(ATA_CMD_CFA_WRITE_NE), \ ata_opcode_name(ATA_CMD_CFA_TRANS_SECT), \ ata_opcode_name(ATA_CMD_CFA_ERASE), \ ata_opcode_name(ATA_CMD_CFA_WRITE_MULT_NE), \ ata_opcode_name(ATA_CMD_REQ_SENSE_DATA), \ ata_opcode_name(ATA_CMD_SANITIZE_DEVICE), \ ata_opcode_name(ATA_CMD_ZAC_MGMT_IN), \ ata_opcode_name(ATA_CMD_ZAC_MGMT_OUT), \ ata_opcode_name(ATA_CMD_RESTORE), \ ata_opcode_name(ATA_CMD_READ_LONG), \ ata_opcode_name(ATA_CMD_READ_LONG_ONCE), \ ata_opcode_name(ATA_CMD_WRITE_LONG), \ ata_opcode_name(ATA_CMD_WRITE_LONG_ONCE)) #define ata_error_name(result) { result, #result } #define show_error_name(val) \ __print_symbolic(val, \ ata_error_name(ATA_ICRC), \ ata_error_name(ATA_UNC), \ ata_error_name(ATA_MC), \ ata_error_name(ATA_IDNF), \ ata_error_name(ATA_MCR), \ ata_error_name(ATA_ABORTED), \ ata_error_name(ATA_TRK0NF), \ ata_error_name(ATA_AMNF)) #define ata_protocol_name(proto) { proto, #proto } #define show_protocol_name(val) \ __print_symbolic(val, \ ata_protocol_name(ATA_PROT_UNKNOWN), \ ata_protocol_name(ATA_PROT_NODATA), \ ata_protocol_name(ATA_PROT_PIO), \ ata_protocol_name(ATA_PROT_DMA), \ ata_protocol_name(ATA_PROT_NCQ), \ ata_protocol_name(ATA_PROT_NCQ_NODATA), \ ata_protocol_name(ATAPI_PROT_NODATA), \ ata_protocol_name(ATAPI_PROT_PIO), \ ata_protocol_name(ATAPI_PROT_DMA)) const char *libata_trace_parse_status(struct trace_seq*, unsigned char); #define __parse_status(s) libata_trace_parse_status(p, s) const char *libata_trace_parse_eh_action(struct trace_seq *, unsigned int); #define __parse_eh_action(a) libata_trace_parse_eh_action(p, a) const char *libata_trace_parse_eh_err_mask(struct trace_seq *, unsigned int); #define __parse_eh_err_mask(m) libata_trace_parse_eh_err_mask(p, m) const char *libata_trace_parse_qc_flags(struct trace_seq *, unsigned int); #define __parse_qc_flags(f) libata_trace_parse_qc_flags(p, f) const char *libata_trace_parse_subcmd(struct trace_seq *, unsigned char, unsigned char, unsigned char); #define __parse_subcmd(c,f,h) libata_trace_parse_subcmd(p, c, f, h) TRACE_EVENT(ata_qc_issue, TP_PROTO(struct ata_queued_cmd *qc), TP_ARGS(qc), TP_STRUCT__entry( __field( unsigned int, ata_port ) __field( unsigned int, ata_dev ) __field( unsigned int, tag ) __field( unsigned char, cmd ) __field( unsigned char, dev ) __field( unsigned char, lbal ) __field( unsigned char, lbam ) __field( unsigned char, lbah ) __field( unsigned char, nsect ) __field( unsigned char, feature ) __field( unsigned char, hob_lbal ) __field( unsigned char, hob_lbam ) __field( unsigned char, hob_lbah ) __field( unsigned char, hob_nsect ) __field( unsigned char, hob_feature ) __field( unsigned char, ctl ) __field( unsigned char, proto ) __field( unsigned long, flags ) ), TP_fast_assign( __entry->ata_port = qc->ap->print_id; __entry->ata_dev = qc->dev->link->pmp + qc->dev->devno; __entry->tag = qc->tag; __entry->proto = qc->tf.protocol; __entry->cmd = qc->tf.command; __entry->dev = qc->tf.device; __entry->lbal = qc->tf.lbal; __entry->lbam = qc->tf.lbam; __entry->lbah = qc->tf.lbah; __entry->hob_lbal = qc->tf.hob_lbal; __entry->hob_lbam = qc->tf.hob_lbam; __entry->hob_lbah = qc->tf.hob_lbah; __entry->feature = qc->tf.feature; __entry->hob_feature = qc->tf.hob_feature; __entry->nsect = qc->tf.nsect; __entry->hob_nsect = qc->tf.hob_nsect; ), TP_printk("ata_port=%u ata_dev=%u tag=%d proto=%s cmd=%s%s " \ " tf=(%02x/%02x:%02x:%02x:%02x:%02x/%02x:%02x:%02x:%02x:%02x/%02x)", __entry->ata_port, __entry->ata_dev, __entry->tag, show_protocol_name(__entry->proto), show_opcode_name(__entry->cmd), __parse_subcmd(__entry->cmd, __entry->feature, __entry->hob_nsect), __entry->cmd, __entry->feature, __entry->nsect, __entry->lbal, __entry->lbam, __entry->lbah, __entry->hob_feature, __entry->hob_nsect, __entry->hob_lbal, __entry->hob_lbam, __entry->hob_lbah, __entry->dev) ); DECLARE_EVENT_CLASS(ata_qc_complete_template, TP_PROTO(struct ata_queued_cmd *qc), TP_ARGS(qc), TP_STRUCT__entry( __field( unsigned int, ata_port ) __field( unsigned int, ata_dev ) __field( unsigned int, tag ) __field( unsigned char, status ) __field( unsigned char, dev ) __field( unsigned char, lbal ) __field( unsigned char, lbam ) __field( unsigned char, lbah ) __field( unsigned char, nsect ) __field( unsigned char, error ) __field( unsigned char, hob_lbal ) __field( unsigned char, hob_lbam ) __field( unsigned char, hob_lbah ) __field( unsigned char, hob_nsect ) __field( unsigned char, hob_feature ) __field( unsigned char, ctl ) __field( unsigned long, flags ) ), TP_fast_assign( __entry->ata_port = qc->ap->print_id; __entry->ata_dev = qc->dev->link->pmp + qc->dev->devno; __entry->tag = qc->tag; __entry->status = qc->result_tf.command; __entry->dev = qc->result_tf.device; __entry->lbal = qc->result_tf.lbal; __entry->lbam = qc->result_tf.lbam; __entry->lbah = qc->result_tf.lbah; __entry->hob_lbal = qc->result_tf.hob_lbal; __entry->hob_lbam = qc->result_tf.hob_lbam; __entry->hob_lbah = qc->result_tf.hob_lbah; __entry->error = qc->result_tf.feature; __entry->hob_feature = qc->result_tf.hob_feature; __entry->nsect = qc->result_tf.nsect; __entry->hob_nsect = qc->result_tf.hob_nsect; ), TP_printk("ata_port=%u ata_dev=%u tag=%d flags=%s status=%s " \ " res=(%02x/%02x:%02x:%02x:%02x:%02x/%02x:%02x:%02x:%02x:%02x/%02x)", __entry->ata_port, __entry->ata_dev, __entry->tag, __parse_qc_flags(__entry->flags), __parse_status(__entry->status), __entry->status, __entry->error, __entry->nsect, __entry->lbal, __entry->lbam, __entry->lbah, __entry->hob_feature, __entry->hob_nsect, __entry->hob_lbal, __entry->hob_lbam, __entry->hob_lbah, __entry->dev) ); DEFINE_EVENT(ata_qc_complete_template, ata_qc_complete_internal, TP_PROTO(struct ata_queued_cmd *qc), TP_ARGS(qc)); DEFINE_EVENT(ata_qc_complete_template, ata_qc_complete_failed, TP_PROTO(struct ata_queued_cmd *qc), TP_ARGS(qc)); DEFINE_EVENT(ata_qc_complete_template, ata_qc_complete_done, TP_PROTO(struct ata_queued_cmd *qc), TP_ARGS(qc)); TRACE_EVENT(ata_eh_link_autopsy, TP_PROTO(struct ata_device *dev, unsigned int eh_action, unsigned int eh_err_mask), TP_ARGS(dev, eh_action, eh_err_mask), TP_STRUCT__entry( __field( unsigned int, ata_port ) __field( unsigned int, ata_dev ) __field( unsigned int, eh_action ) __field( unsigned int, eh_err_mask) ), TP_fast_assign( __entry->ata_port = dev->link->ap->print_id; __entry->ata_dev = dev->link->pmp + dev->devno; __entry->eh_action = eh_action; __entry->eh_err_mask = eh_err_mask; ), TP_printk("ata_port=%u ata_dev=%u eh_action=%s err_mask=%s", __entry->ata_port, __entry->ata_dev, __parse_eh_action(__entry->eh_action), __parse_eh_err_mask(__entry->eh_err_mask)) ); TRACE_EVENT(ata_eh_link_autopsy_qc, TP_PROTO(struct ata_queued_cmd *qc), TP_ARGS(qc), TP_STRUCT__entry( __field( unsigned int, ata_port ) __field( unsigned int, ata_dev ) __field( unsigned int, tag ) __field( unsigned int, qc_flags ) __field( unsigned int, eh_err_mask) ), TP_fast_assign( __entry->ata_port = qc->ap->print_id; __entry->ata_dev = qc->dev->link->pmp + qc->dev->devno; __entry->tag = qc->tag; __entry->qc_flags = qc->flags; __entry->eh_err_mask = qc->err_mask; ), TP_printk("ata_port=%u ata_dev=%u tag=%d flags=%s err_mask=%s", __entry->ata_port, __entry->ata_dev, __entry->tag, __parse_qc_flags(__entry->qc_flags), __parse_eh_err_mask(__entry->eh_err_mask)) ); #endif /* _TRACE_LIBATA_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 #undef TRACE_SYSTEM #define TRACE_SYSTEM rtc #if !defined(_TRACE_RTC_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_RTC_H #include <linux/rtc.h> #include <linux/tracepoint.h> DECLARE_EVENT_CLASS(rtc_time_alarm_class, TP_PROTO(time64_t secs, int err), TP_ARGS(secs, err), TP_STRUCT__entry( __field(time64_t, secs) __field(int, err) ), TP_fast_assign( __entry->secs = secs; __entry->err = err; ), TP_printk("UTC (%lld) (%d)", __entry->secs, __entry->err ) ); DEFINE_EVENT(rtc_time_alarm_class, rtc_set_time, TP_PROTO(time64_t secs, int err), TP_ARGS(secs, err) ); DEFINE_EVENT(rtc_time_alarm_class, rtc_read_time, TP_PROTO(time64_t secs, int err), TP_ARGS(secs, err) ); DEFINE_EVENT(rtc_time_alarm_class, rtc_set_alarm, TP_PROTO(time64_t secs, int err), TP_ARGS(secs, err) ); DEFINE_EVENT(rtc_time_alarm_class, rtc_read_alarm, TP_PROTO(time64_t secs, int err), TP_ARGS(secs, err) ); TRACE_EVENT(rtc_irq_set_freq, TP_PROTO(int freq, int err), TP_ARGS(freq, err), TP_STRUCT__entry( __field(int, freq) __field(int, err) ), TP_fast_assign( __entry->freq = freq; __entry->err = err; ), TP_printk("set RTC periodic IRQ frequency:%u (%d)", __entry->freq, __entry->err ) ); TRACE_EVENT(rtc_irq_set_state, TP_PROTO(int enabled, int err), TP_ARGS(enabled, err), TP_STRUCT__entry( __field(int, enabled) __field(int, err) ), TP_fast_assign( __entry->enabled = enabled; __entry->err = err; ), TP_printk("%s RTC 2^N Hz periodic IRQs (%d)", __entry->enabled ? "enable" : "disable", __entry->err ) ); TRACE_EVENT(rtc_alarm_irq_enable, TP_PROTO(unsigned int enabled, int err), TP_ARGS(enabled, err), TP_STRUCT__entry( __field(unsigned int, enabled) __field(int, err) ), TP_fast_assign( __entry->enabled = enabled; __entry->err = err; ), TP_printk("%s RTC alarm IRQ (%d)", __entry->enabled ? "enable" : "disable", __entry->err ) ); DECLARE_EVENT_CLASS(rtc_offset_class, TP_PROTO(long offset, int err), TP_ARGS(offset, err), TP_STRUCT__entry( __field(long, offset) __field(int, err) ), TP_fast_assign( __entry->offset = offset; __entry->err = err; ), TP_printk("RTC offset: %ld (%d)", __entry->offset, __entry->err ) ); DEFINE_EVENT(rtc_offset_class, rtc_set_offset, TP_PROTO(long offset, int err), TP_ARGS(offset, err) ); DEFINE_EVENT(rtc_offset_class, rtc_read_offset, TP_PROTO(long offset, int err), TP_ARGS(offset, err) ); DECLARE_EVENT_CLASS(rtc_timer_class, TP_PROTO(struct rtc_timer *timer), TP_ARGS(timer), TP_STRUCT__entry( __field(struct rtc_timer *, timer) __field(ktime_t, expires) __field(ktime_t, period) ), TP_fast_assign( __entry->timer = timer; __entry->expires = timer->node.expires; __entry->period = timer->period; ), TP_printk("RTC timer:(%p) expires:%lld period:%lld", __entry->timer, __entry->expires, __entry->period ) ); DEFINE_EVENT(rtc_timer_class, rtc_timer_enqueue, TP_PROTO(struct rtc_timer *timer), TP_ARGS(timer) ); DEFINE_EVENT(rtc_timer_class, rtc_timer_dequeue, TP_PROTO(struct rtc_timer *timer), TP_ARGS(timer) ); DEFINE_EVENT(rtc_timer_class, rtc_timer_fired, TP_PROTO(struct rtc_timer *timer), TP_ARGS(timer) ); #endif /* _TRACE_RTC_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM io_uring #if !defined(_TRACE_IO_URING_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_IO_URING_H #include <linux/tracepoint.h> struct io_wq_work; /** * io_uring_create - called after a new io_uring context was prepared * * @fd: corresponding file descriptor * @ctx: pointer to a ring context structure * @sq_entries: actual SQ size * @cq_entries: actual CQ size * @flags: SQ ring flags, provided to io_uring_setup(2) * * Allows to trace io_uring creation and provide pointer to a context, that can * be used later to find correlated events. */ TRACE_EVENT(io_uring_create, TP_PROTO(int fd, void *ctx, u32 sq_entries, u32 cq_entries, u32 flags), TP_ARGS(fd, ctx, sq_entries, cq_entries, flags), TP_STRUCT__entry ( __field( int, fd ) __field( void *, ctx ) __field( u32, sq_entries ) __field( u32, cq_entries ) __field( u32, flags ) ), TP_fast_assign( __entry->fd = fd; __entry->ctx = ctx; __entry->sq_entries = sq_entries; __entry->cq_entries = cq_entries; __entry->flags = flags; ), TP_printk("ring %p, fd %d sq size %d, cq size %d, flags %d", __entry->ctx, __entry->fd, __entry->sq_entries, __entry->cq_entries, __entry->flags) ); /** * io_uring_register - called after a buffer/file/eventfd was succesfully * registered for a ring * * @ctx: pointer to a ring context structure * @opcode: describes which operation to perform * @nr_user_files: number of registered files * @nr_user_bufs: number of registered buffers * @cq_ev_fd: whether eventfs registered or not * @ret: return code * * Allows to trace fixed files/buffers/eventfds, that could be registered to * avoid an overhead of getting references to them for every operation. This * event, together with io_uring_file_get, can provide a full picture of how * much overhead one can reduce via fixing. */ TRACE_EVENT(io_uring_register, TP_PROTO(void *ctx, unsigned opcode, unsigned nr_files, unsigned nr_bufs, bool eventfd, long ret), TP_ARGS(ctx, opcode, nr_files, nr_bufs, eventfd, ret), TP_STRUCT__entry ( __field( void *, ctx ) __field( unsigned, opcode ) __field( unsigned, nr_files ) __field( unsigned, nr_bufs ) __field( bool, eventfd ) __field( long, ret ) ), TP_fast_assign( __entry->ctx = ctx; __entry->opcode = opcode; __entry->nr_files = nr_files; __entry->nr_bufs = nr_bufs; __entry->eventfd = eventfd; __entry->ret = ret; ), TP_printk("ring %p, opcode %d, nr_user_files %d, nr_user_bufs %d, " "eventfd %d, ret %ld", __entry->ctx, __entry->opcode, __entry->nr_files, __entry->nr_bufs, __entry->eventfd, __entry->ret) ); /** * io_uring_file_get - called before getting references to an SQE file * * @ctx: pointer to a ring context structure * @fd: SQE file descriptor * * Allows to trace out how often an SQE file reference is obtained, which can * help figuring out if it makes sense to use fixed files, or check that fixed * files are used correctly. */ TRACE_EVENT(io_uring_file_get, TP_PROTO(void *ctx, int fd), TP_ARGS(ctx, fd), TP_STRUCT__entry ( __field( void *, ctx ) __field( int, fd ) ), TP_fast_assign( __entry->ctx = ctx; __entry->fd = fd; ), TP_printk("ring %p, fd %d", __entry->ctx, __entry->fd) ); /** * io_uring_queue_async_work - called before submitting a new async work * * @ctx: pointer to a ring context structure * @hashed: type of workqueue, hashed or normal * @req: pointer to a submitted request * @work: pointer to a submitted io_wq_work * * Allows to trace asynchronous work submission. */ TRACE_EVENT(io_uring_queue_async_work, TP_PROTO(void *ctx, int rw, void * req, struct io_wq_work *work, unsigned int flags), TP_ARGS(ctx, rw, req, work, flags), TP_STRUCT__entry ( __field( void *, ctx ) __field( int, rw ) __field( void *, req ) __field( struct io_wq_work *, work ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->ctx = ctx; __entry->rw = rw; __entry->req = req; __entry->work = work; __entry->flags = flags; ), TP_printk("ring %p, request %p, flags %d, %s queue, work %p", __entry->ctx, __entry->req, __entry->flags, __entry->rw ? "hashed" : "normal", __entry->work) ); /** * io_uring_defer - called when an io_uring request is deferred * * @ctx: pointer to a ring context structure * @req: pointer to a deferred request * @user_data: user data associated with the request * * Allows to track deferred requests, to get an insight about what requests are * not started immediately. */ TRACE_EVENT(io_uring_defer, TP_PROTO(void *ctx, void *req, unsigned long long user_data), TP_ARGS(ctx, req, user_data), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, data ) ), TP_fast_assign( __entry->ctx = ctx; __entry->req = req; __entry->data = user_data; ), TP_printk("ring %p, request %p user_data %llu", __entry->ctx, __entry->req, __entry->data) ); /** * io_uring_link - called before the io_uring request added into link_list of * another request * * @ctx: pointer to a ring context structure * @req: pointer to a linked request * @target_req: pointer to a previous request, that would contain @req * * Allows to track linked requests, to understand dependencies between requests * and how does it influence their execution flow. */ TRACE_EVENT(io_uring_link, TP_PROTO(void *ctx, void *req, void *target_req), TP_ARGS(ctx, req, target_req), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( void *, target_req ) ), TP_fast_assign( __entry->ctx = ctx; __entry->req = req; __entry->target_req = target_req; ), TP_printk("ring %p, request %p linked after %p", __entry->ctx, __entry->req, __entry->target_req) ); /** * io_uring_cqring_wait - called before start waiting for an available CQE * * @ctx: pointer to a ring context structure * @min_events: minimal number of events to wait for * * Allows to track waiting for CQE, so that we can e.g. troubleshoot * situations, when an application wants to wait for an event, that never * comes. */ TRACE_EVENT(io_uring_cqring_wait, TP_PROTO(void *ctx, int min_events), TP_ARGS(ctx, min_events), TP_STRUCT__entry ( __field( void *, ctx ) __field( int, min_events ) ), TP_fast_assign( __entry->ctx = ctx; __entry->min_events = min_events; ), TP_printk("ring %p, min_events %d", __entry->ctx, __entry->min_events) ); /** * io_uring_fail_link - called before failing a linked request * * @req: request, which links were cancelled * @link: cancelled link * * Allows to track linked requests cancellation, to see not only that some work * was cancelled, but also which request was the reason. */ TRACE_EVENT(io_uring_fail_link, TP_PROTO(void *req, void *link), TP_ARGS(req, link), TP_STRUCT__entry ( __field( void *, req ) __field( void *, link ) ), TP_fast_assign( __entry->req = req; __entry->link = link; ), TP_printk("request %p, link %p", __entry->req, __entry->link) ); /** * io_uring_complete - called when completing an SQE * * @ctx: pointer to a ring context structure * @user_data: user data associated with the request * @res: result of the request * */ TRACE_EVENT(io_uring_complete, TP_PROTO(void *ctx, u64 user_data, long res), TP_ARGS(ctx, user_data, res), TP_STRUCT__entry ( __field( void *, ctx ) __field( u64, user_data ) __field( long, res ) ), TP_fast_assign( __entry->ctx = ctx; __entry->user_data = user_data; __entry->res = res; ), TP_printk("ring %p, user_data 0x%llx, result %ld", __entry->ctx, (unsigned long long)__entry->user_data, __entry->res) ); /** * io_uring_submit_sqe - called before submitting one SQE * * @ctx: pointer to a ring context structure * @opcode: opcode of request * @user_data: user data associated with the request * @force_nonblock: whether a context blocking or not * @sq_thread: true if sq_thread has submitted this SQE * * Allows to track SQE submitting, to understand what was the source of it, SQ * thread or io_uring_enter call. */ TRACE_EVENT(io_uring_submit_sqe, TP_PROTO(void *ctx, u8 opcode, u64 user_data, bool force_nonblock, bool sq_thread), TP_ARGS(ctx, opcode, user_data, force_nonblock, sq_thread), TP_STRUCT__entry ( __field( void *, ctx ) __field( u8, opcode ) __field( u64, user_data ) __field( bool, force_nonblock ) __field( bool, sq_thread ) ), TP_fast_assign( __entry->ctx = ctx; __entry->opcode = opcode; __entry->user_data = user_data; __entry->force_nonblock = force_nonblock; __entry->sq_thread = sq_thread; ), TP_printk("ring %p, op %d, data 0x%llx, non block %d, sq_thread %d", __entry->ctx, __entry->opcode, (unsigned long long) __entry->user_data, __entry->force_nonblock, __entry->sq_thread) ); TRACE_EVENT(io_uring_poll_arm, TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask, int events), TP_ARGS(ctx, opcode, user_data, mask, events), TP_STRUCT__entry ( __field( void *, ctx ) __field( u8, opcode ) __field( u64, user_data ) __field( int, mask ) __field( int, events ) ), TP_fast_assign( __entry->ctx = ctx; __entry->opcode = opcode; __entry->user_data = user_data; __entry->mask = mask; __entry->events = events; ), TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x, events 0x%x", __entry->ctx, __entry->opcode, (unsigned long long) __entry->user_data, __entry->mask, __entry->events) ); TRACE_EVENT(io_uring_poll_wake, TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask), TP_ARGS(ctx, opcode, user_data, mask), TP_STRUCT__entry ( __field( void *, ctx ) __field( u8, opcode ) __field( u64, user_data ) __field( int, mask ) ), TP_fast_assign( __entry->ctx = ctx; __entry->opcode = opcode; __entry->user_data = user_data; __entry->mask = mask; ), TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x", __entry->ctx, __entry->opcode, (unsigned long long) __entry->user_data, __entry->mask) ); TRACE_EVENT(io_uring_task_add, TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask), TP_ARGS(ctx, opcode, user_data, mask), TP_STRUCT__entry ( __field( void *, ctx ) __field( u8, opcode ) __field( u64, user_data ) __field( int, mask ) ), TP_fast_assign( __entry->ctx = ctx; __entry->opcode = opcode; __entry->user_data = user_data; __entry->mask = mask; ), TP_printk("ring %p, op %d, data 0x%llx, mask %x", __entry->ctx, __entry->opcode, (unsigned long long) __entry->user_data, __entry->mask) ); TRACE_EVENT(io_uring_task_run, TP_PROTO(void *ctx, u8 opcode, u64 user_data), TP_ARGS(ctx, opcode, user_data), TP_STRUCT__entry ( __field( void *, ctx ) __field( u8, opcode ) __field( u64, user_data ) ), TP_fast_assign( __entry->ctx = ctx; __entry->opcode = opcode; __entry->user_data = user_data; ), TP_printk("ring %p, op %d, data 0x%llx", __entry->ctx, __entry->opcode, (unsigned long long) __entry->user_data) ); #endif /* _TRACE_IO_URING_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 #ifndef _LINUX_UNALIGNED_PACKED_STRUCT_H #define _LINUX_UNALIGNED_PACKED_STRUCT_H #include <linux/kernel.h> struct __una_u16 { u16 x; } __packed; struct __una_u32 { u32 x; } __packed; struct __una_u64 { u64 x; } __packed; static inline u16 __get_unaligned_cpu16(const void *p) { const struct __una_u16 *ptr = (const struct __una_u16 *)p; return ptr->x; } static inline u32 __get_unaligned_cpu32(const void *p) { const struct __una_u32 *ptr = (const struct __una_u32 *)p; return ptr->x; } static inline u64 __get_unaligned_cpu64(const void *p) { const struct __una_u64 *ptr = (const struct __una_u64 *)p; return ptr->x; } static inline void __put_unaligned_cpu16(u16 val, void *p) { struct __una_u16 *ptr = (struct __una_u16 *)p; ptr->x = val; } static inline void __put_unaligned_cpu32(u32 val, void *p) { struct __una_u32 *ptr = (struct __una_u32 *)p; ptr->x = val; } static inline void __put_unaligned_cpu64(u64 val, void *p) { struct __una_u64 *ptr = (struct __una_u64 *)p; ptr->x = val; } #endif /* _LINUX_UNALIGNED_PACKED_STRUCT_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2016 Qualcomm Atheros, Inc * * Based on net/sched/sch_fq_codel.c */ #ifndef __NET_SCHED_FQ_IMPL_H #define __NET_SCHED_FQ_IMPL_H #include <net/fq.h> /* functions that are embedded into includer */ static void fq_adjust_removal(struct fq *fq, struct fq_flow *flow, struct sk_buff *skb) { struct fq_tin *tin = flow->tin; tin->backlog_bytes -= skb->len; tin->backlog_packets--; flow->backlog -= skb->len; fq->backlog--; fq->memory_usage -= skb->truesize; } static void fq_rejigger_backlog(struct fq *fq, struct fq_flow *flow) { struct fq_flow *i; if (flow->backlog == 0) { list_del_init(&flow->backlogchain); } else { i = flow; list_for_each_entry_continue(i, &fq->backlogs, backlogchain) if (i->backlog < flow->backlog) break; list_move_tail(&flow->backlogchain, &i->backlogchain); } } static struct sk_buff *fq_flow_dequeue(struct fq *fq, struct fq_flow *flow) { struct sk_buff *skb; lockdep_assert_held(&fq->lock); skb = __skb_dequeue(&flow->queue); if (!skb) return NULL; fq_adjust_removal(fq, flow, skb); fq_rejigger_backlog(fq, flow); return skb; } static struct sk_buff *fq_tin_dequeue(struct fq *fq, struct fq_tin *tin, fq_tin_dequeue_t dequeue_func) { struct fq_flow *flow; struct list_head *head; struct sk_buff *skb; lockdep_assert_held(&fq->lock); begin: head = &tin->new_flows; if (list_empty(head)) { head = &tin->old_flows; if (list_empty(head)) return NULL; } flow = list_first_entry(head, struct fq_flow, flowchain); if (flow->deficit <= 0) { flow->deficit += fq->quantum; list_move_tail(&flow->flowchain, &tin->old_flows); goto begin; } skb = dequeue_func(fq, tin, flow); if (!skb) { /* force a pass through old_flows to prevent starvation */ if ((head == &tin->new_flows) && !list_empty(&tin->old_flows)) { list_move_tail(&flow->flowchain, &tin->old_flows); } else { list_del_init(&flow->flowchain); flow->tin = NULL; } goto begin; } flow->deficit -= skb->len; tin->tx_bytes += skb->len; tin->tx_packets++; return skb; } static u32 fq_flow_idx(struct fq *fq, struct sk_buff *skb) { u32 hash = skb_get_hash(skb); return reciprocal_scale(hash, fq->flows_cnt); } static struct fq_flow *fq_flow_classify(struct fq *fq, struct fq_tin *tin, u32 idx, struct sk_buff *skb, fq_flow_get_default_t get_default_func) { struct fq_flow *flow; lockdep_assert_held(&fq->lock); flow = &fq->flows[idx]; if (flow->tin && flow->tin != tin) { flow = get_default_func(fq, tin, idx, skb); tin->collisions++; fq->collisions++; } if (!flow->tin) tin->flows++; return flow; } static void fq_recalc_backlog(struct fq *fq, struct fq_tin *tin, struct fq_flow *flow) { struct fq_flow *i; if (list_empty(&flow->backlogchain)) list_add_tail(&flow->backlogchain, &fq->backlogs); i = flow; list_for_each_entry_continue_reverse(i, &fq->backlogs, backlogchain) if (i->backlog > flow->backlog) break; list_move(&flow->backlogchain, &i->backlogchain); } static void fq_tin_enqueue(struct fq *fq, struct fq_tin *tin, u32 idx, struct sk_buff *skb, fq_skb_free_t free_func, fq_flow_get_default_t get_default_func) { struct fq_flow *flow; bool oom; lockdep_assert_held(&fq->lock); flow = fq_flow_classify(fq, tin, idx, skb, get_default_func); flow->tin = tin; flow->backlog += skb->len; tin->backlog_bytes += skb->len; tin->backlog_packets++; fq->memory_usage += skb->truesize; fq->backlog++; fq_recalc_backlog(fq, tin, flow); if (list_empty(&flow->flowchain)) { flow->deficit = fq->quantum; list_add_tail(&flow->flowchain, &tin->new_flows); } __skb_queue_tail(&flow->queue, skb); oom = (fq->memory_usage > fq->memory_limit); while (fq->backlog > fq->limit || oom) { flow = list_first_entry_or_null(&fq->backlogs, struct fq_flow, backlogchain); if (!flow) return; skb = fq_flow_dequeue(fq, flow); if (!skb) return; free_func(fq, flow->tin, flow, skb); flow->tin->overlimit++; fq->overlimit++; if (oom) { fq->overmemory++; oom = (fq->memory_usage > fq->memory_limit); } } } static void fq_flow_filter(struct fq *fq, struct fq_flow *flow, fq_skb_filter_t filter_func, void *filter_data, fq_skb_free_t free_func) { struct fq_tin *tin = flow->tin; struct sk_buff *skb, *tmp; lockdep_assert_held(&fq->lock); skb_queue_walk_safe(&flow->queue, skb, tmp) { if (!filter_func(fq, tin, flow, skb, filter_data)) continue; __skb_unlink(skb, &flow->queue); fq_adjust_removal(fq, flow, skb); free_func(fq, tin, flow, skb); } fq_rejigger_backlog(fq, flow); } static void fq_tin_filter(struct fq *fq, struct fq_tin *tin, fq_skb_filter_t filter_func, void *filter_data, fq_skb_free_t free_func) { struct fq_flow *flow; lockdep_assert_held(&fq->lock); list_for_each_entry(flow, &tin->new_flows, flowchain) fq_flow_filter(fq, flow, filter_func, filter_data, free_func); list_for_each_entry(flow, &tin->old_flows, flowchain) fq_flow_filter(fq, flow, filter_func, filter_data, free_func); } static void fq_flow_reset(struct fq *fq, struct fq_flow *flow, fq_skb_free_t free_func) { struct sk_buff *skb; while ((skb = fq_flow_dequeue(fq, flow))) free_func(fq, flow->tin, flow, skb); if (!list_empty(&flow->flowchain)) list_del_init(&flow->flowchain); if (!list_empty(&flow->backlogchain)) list_del_init(&flow->backlogchain); flow->tin = NULL; WARN_ON_ONCE(flow->backlog); } static void fq_tin_reset(struct fq *fq, struct fq_tin *tin, fq_skb_free_t free_func) { struct list_head *head; struct fq_flow *flow; for (;;) { head = &tin->new_flows; if (list_empty(head)) { head = &tin->old_flows; if (list_empty(head)) break; } flow = list_first_entry(head, struct fq_flow, flowchain); fq_flow_reset(fq, flow, free_func); } WARN_ON_ONCE(tin->backlog_bytes); WARN_ON_ONCE(tin->backlog_packets); } static void fq_flow_init(struct fq_flow *flow) { INIT_LIST_HEAD(&flow->flowchain); INIT_LIST_HEAD(&flow->backlogchain); __skb_queue_head_init(&flow->queue); } static void fq_tin_init(struct fq_tin *tin) { INIT_LIST_HEAD(&tin->new_flows); INIT_LIST_HEAD(&tin->old_flows); } static int fq_init(struct fq *fq, int flows_cnt) { int i; memset(fq, 0, sizeof(fq[0])); INIT_LIST_HEAD(&fq->backlogs); spin_lock_init(&fq->lock); fq->flows_cnt = max_t(u32, flows_cnt, 1); fq->quantum = 300; fq->limit = 8192; fq->memory_limit = 16 << 20; /* 16 MBytes */ fq->flows = kvcalloc(fq->flows_cnt, sizeof(fq->flows[0]), GFP_KERNEL); if (!fq->flows) return -ENOMEM; for (i = 0; i < fq->flows_cnt; i++) fq_flow_init(&fq->flows[i]); return 0; } static void fq_reset(struct fq *fq, fq_skb_free_t free_func) { int i; for (i = 0; i < fq->flows_cnt; i++) fq_flow_reset(fq, &fq->flows[i], free_func); kvfree(fq->flows); fq->flows = NULL; } #endif
3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_PREEMPT_H #define __ASM_PREEMPT_H #include <asm/rmwcc.h> #include <asm/percpu.h> #include <linux/thread_info.h> DECLARE_PER_CPU(int, __preempt_count); /* We use the MSB mostly because its available */ #define PREEMPT_NEED_RESCHED 0x80000000 /* * We use the PREEMPT_NEED_RESCHED bit as an inverted NEED_RESCHED such * that a decrement hitting 0 means we can and should reschedule. */ #define PREEMPT_ENABLED (0 + PREEMPT_NEED_RESCHED) /* * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users * that think a non-zero value indicates we cannot preempt. */ static __always_inline int preempt_count(void) { return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED; } static __always_inline void preempt_count_set(int pc) { int old, new; do { old = raw_cpu_read_4(__preempt_count); new = (old & PREEMPT_NEED_RESCHED) | (pc & ~PREEMPT_NEED_RESCHED); } while (raw_cpu_cmpxchg_4(__preempt_count, old, new) != old); } /* * must be macros to avoid header recursion hell */ #define init_task_preempt_count(p) do { } while (0) #define init_idle_preempt_count(p, cpu) do { \ per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \ } while (0) /* * We fold the NEED_RESCHED bit into the preempt count such that * preempt_enable() can decrement and test for needing to reschedule with a * single instruction. * * We invert the actual bit, so that when the decrement hits 0 we know we both * need to resched (the bit is cleared) and can resched (no preempt count). */ static __always_inline void set_preempt_need_resched(void) { raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED); } static __always_inline void clear_preempt_need_resched(void) { raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED); } static __always_inline bool test_preempt_need_resched(void) { return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED); } /* * The various preempt_count add/sub methods */ static __always_inline void __preempt_count_add(int val) { raw_cpu_add_4(__preempt_count, val); } static __always_inline void __preempt_count_sub(int val) { raw_cpu_add_4(__preempt_count, -val); } /* * Because we keep PREEMPT_NEED_RESCHED set when we do _not_ need to reschedule * a decrement which hits zero means we have no preempt_count and should * reschedule. */ static __always_inline bool __preempt_count_dec_and_test(void) { return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var])); } /* * Returns true when we need to resched and can (barring IRQ state). */ static __always_inline bool should_resched(int preempt_offset) { return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); } #ifdef CONFIG_PREEMPTION extern asmlinkage void preempt_schedule_thunk(void); # define __preempt_schedule() \ asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT) extern asmlinkage void preempt_schedule(void); extern asmlinkage void preempt_schedule_notrace_thunk(void); # define __preempt_schedule_notrace() \ asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT) extern asmlinkage void preempt_schedule_notrace(void); #endif #endif /* __ASM_PREEMPT_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_DCACHE_H #define __LINUX_DCACHE_H #include <linux/atomic.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/rculist_bl.h> #include <linux/spinlock.h> #include <linux/seqlock.h> #include <linux/cache.h> #include <linux/rcupdate.h> #include <linux/lockref.h> #include <linux/stringhash.h> #include <linux/wait.h> struct path; struct vfsmount; /* * linux/include/linux/dcache.h * * Dirent cache data structures * * (C) Copyright 1997 Thomas Schoebel-Theuer, * with heavy changes by Linus Torvalds */ #define IS_ROOT(x) ((x) == (x)->d_parent) /* The hash is always the low bits of hash_len */ #ifdef __LITTLE_ENDIAN #define HASH_LEN_DECLARE u32 hash; u32 len #define bytemask_from_count(cnt) (~(~0ul << (cnt)*8)) #else #define HASH_LEN_DECLARE u32 len; u32 hash #define bytemask_from_count(cnt) (~(~0ul >> (cnt)*8)) #endif /* * "quick string" -- eases parameter passing, but more importantly * saves "metadata" about the string (ie length and the hash). * * hash comes first so it snuggles against d_parent in the * dentry. */ struct qstr { union { struct { HASH_LEN_DECLARE; }; u64 hash_len; }; const unsigned char *name; }; #define QSTR_INIT(n,l) { { { .len = l } }, .name = n } extern const struct qstr empty_name; extern const struct qstr slash_name; struct dentry_stat_t { long nr_dentry; long nr_unused; long age_limit; /* age in seconds */ long want_pages; /* pages requested by system */ long nr_negative; /* # of unused negative dentries */ long dummy; /* Reserved for future use */ }; extern struct dentry_stat_t dentry_stat; /* * Try to keep struct dentry aligned on 64 byte cachelines (this will * give reasonable cacheline footprint with larger lines without the * large memory footprint increase). */ #ifdef CONFIG_64BIT # define DNAME_INLINE_LEN 32 /* 192 bytes */ #else # ifdef CONFIG_SMP # define DNAME_INLINE_LEN 36 /* 128 bytes */ # else # define DNAME_INLINE_LEN 40 /* 128 bytes */ # endif #endif #define d_lock d_lockref.lock struct dentry { /* RCU lookup touched fields */ unsigned int d_flags; /* protected by d_lock */ seqcount_spinlock_t d_seq; /* per dentry seqlock */ struct hlist_bl_node d_hash; /* lookup hash list */ struct dentry *d_parent; /* parent directory */ struct qstr d_name; struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ /* Ref lookup also touches following */ struct lockref d_lockref; /* per-dentry lock and refcount */ const struct dentry_operations *d_op; struct super_block *d_sb; /* The root of the dentry tree */ unsigned long d_time; /* used by d_revalidate */ void *d_fsdata; /* fs-specific data */ union { struct list_head d_lru; /* LRU list */ wait_queue_head_t *d_wait; /* in-lookup ones only */ }; struct list_head d_child; /* child of parent list */ struct list_head d_subdirs; /* our children */ /* * d_alias and d_rcu can share memory */ union { struct hlist_node d_alias; /* inode alias list */ struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */ struct rcu_head d_rcu; } d_u; } __randomize_layout; /* * dentry->d_lock spinlock nesting subclasses: * * 0: normal * 1: nested */ enum dentry_d_lock_class { DENTRY_D_LOCK_NORMAL, /* implicitly used by plain spin_lock() APIs. */ DENTRY_D_LOCK_NESTED }; struct dentry_operations { int (*d_revalidate)(struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); int (*d_hash)(const struct dentry *, struct qstr *); int (*d_compare)(const struct dentry *, unsigned int, const char *, const struct qstr *); int (*d_delete)(const struct dentry *); int (*d_init)(struct dentry *); void (*d_release)(struct dentry *); void (*d_prune)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, const struct inode *); } ____cacheline_aligned; /* * Locking rules for dentry_operations callbacks are to be found in * Documentation/filesystems/locking.rst. Keep it updated! * * FUrther descriptions are found in Documentation/filesystems/vfs.rst. * Keep it updated too! */ /* d_flags entries */ #define DCACHE_OP_HASH 0x00000001 #define DCACHE_OP_COMPARE 0x00000002 #define DCACHE_OP_REVALIDATE 0x00000004 #define DCACHE_OP_DELETE 0x00000008 #define DCACHE_OP_PRUNE 0x00000010 #define DCACHE_DISCONNECTED 0x00000020 /* This dentry is possibly not currently connected to the dcache tree, in * which case its parent will either be itself, or will have this flag as * well. nfsd will not use a dentry with this bit set, but will first * endeavour to clear the bit either by discovering that it is connected, * or by performing lookup operations. Any filesystem which supports * nfsd_operations MUST have a lookup function which, if it finds a * directory inode with a DCACHE_DISCONNECTED dentry, will d_move that * dentry into place and return that dentry rather than the passed one, * typically using d_splice_alias. */ #define DCACHE_REFERENCED 0x00000040 /* Recently used, don't discard. */ #define DCACHE_DONTCACHE 0x00000080 /* Purge from memory on final dput() */ #define DCACHE_CANT_MOUNT 0x00000100 #define DCACHE_GENOCIDE 0x00000200 #define DCACHE_SHRINK_LIST 0x00000400 #define DCACHE_OP_WEAK_REVALIDATE 0x00000800 #define DCACHE_NFSFS_RENAMED 0x00001000 /* this dentry has been "silly renamed" and has to be deleted on the last * dput() */ #define DCACHE_COOKIE 0x00002000 /* For use by dcookie subsystem */ #define DCACHE_FSNOTIFY_PARENT_WATCHED 0x00004000 /* Parent inode is watched by some fsnotify listener */ #define DCACHE_DENTRY_KILLED 0x00008000 #define DCACHE_MOUNTED 0x00010000 /* is a mountpoint */ #define DCACHE_NEED_AUTOMOUNT 0x00020000 /* handle automount on this dir */ #define DCACHE_MANAGE_TRANSIT 0x00040000 /* manage transit from this dirent */ #define DCACHE_MANAGED_DENTRY \ (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT) #define DCACHE_LRU_LIST 0x00080000 #define DCACHE_ENTRY_TYPE 0x00700000 #define DCACHE_MISS_TYPE 0x00000000 /* Negative dentry (maybe fallthru to nowhere) */ #define DCACHE_WHITEOUT_TYPE 0x00100000 /* Whiteout dentry (stop pathwalk) */ #define DCACHE_DIRECTORY_TYPE 0x00200000 /* Normal directory */ #define DCACHE_AUTODIR_TYPE 0x00300000 /* Lookupless directory (presumed automount) */ #define DCACHE_REGULAR_TYPE 0x00400000 /* Regular file type (or fallthru to such) */ #define DCACHE_SPECIAL_TYPE 0x00500000 /* Other file type (or fallthru to such) */ #define DCACHE_SYMLINK_TYPE 0x00600000 /* Symlink (or fallthru to such) */ #define DCACHE_MAY_FREE 0x00800000 #define DCACHE_FALLTHRU 0x01000000 /* Fall through to lower layer */ #define DCACHE_NOKEY_NAME 0x02000000 /* Encrypted name encoded without key */ #define DCACHE_OP_REAL 0x04000000 #define DCACHE_PAR_LOOKUP 0x10000000 /* being looked up (with parent locked shared) */ #define DCACHE_DENTRY_CURSOR 0x20000000 #define DCACHE_NORCU 0x40000000 /* No RCU delay for freeing */ extern seqlock_t rename_lock; /* * These are the low-level FS interfaces to the dcache.. */ extern void d_instantiate(struct dentry *, struct inode *); extern void d_instantiate_new(struct dentry *, struct inode *); extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *); extern struct dentry * d_instantiate_anon(struct dentry *, struct inode *); extern void __d_drop(struct dentry *dentry); extern void d_drop(struct dentry *dentry); extern void d_delete(struct dentry *); extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op); /* allocate/de-allocate */ extern struct dentry * d_alloc(struct dentry *, const struct qstr *); extern struct dentry * d_alloc_anon(struct super_block *); extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *, wait_queue_head_t *); extern struct dentry * d_splice_alias(struct inode *, struct dentry *); extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); extern struct dentry * d_exact_alias(struct dentry *, struct inode *); extern struct dentry *d_find_any_alias(struct inode *inode); extern struct dentry * d_obtain_alias(struct inode *); extern struct dentry * d_obtain_root(struct inode *); extern void shrink_dcache_sb(struct super_block *); extern void shrink_dcache_parent(struct dentry *); extern void shrink_dcache_for_umount(struct super_block *); extern void d_invalidate(struct dentry *); /* only used at mount-time */ extern struct dentry * d_make_root(struct inode *); /* <clickety>-<click> the ramfs-type tree */ extern void d_genocide(struct dentry *); extern void d_tmpfile(struct dentry *, struct inode *); extern struct dentry *d_find_alias(struct inode *); extern void d_prune_aliases(struct inode *); /* test whether we have any submounts in a subdir tree */ extern int path_has_submounts(const struct path *); /* * This adds the entry to the hash queues. */ extern void d_rehash(struct dentry *); extern void d_add(struct dentry *, struct inode *); /* used for rename() and baskets */ extern void d_move(struct dentry *, struct dentry *); extern void d_exchange(struct dentry *, struct dentry *); extern struct dentry *d_ancestor(struct dentry *, struct dentry *); /* appendix may either be NULL or be used for transname suffixes */ extern struct dentry *d_lookup(const struct dentry *, const struct qstr *); extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *); extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *); extern struct dentry *__d_lookup_rcu(const struct dentry *parent, const struct qstr *name, unsigned *seq); static inline unsigned d_count(const struct dentry *dentry) { return dentry->d_lockref.count; } /* * helper function for dentry_operations.d_dname() members */ extern __printf(4, 5) char *dynamic_dname(struct dentry *, char *, int, const char *, ...); extern char *__d_path(const struct path *, const struct path *, char *, int); extern char *d_absolute_path(const struct path *, char *, int); extern char *d_path(const struct path *, char *, int); extern char *dentry_path_raw(struct dentry *, char *, int); extern char *dentry_path(struct dentry *, char *, int); /* Allocation counts.. */ /** * dget, dget_dlock - get a reference to a dentry * @dentry: dentry to get a reference to * * Given a dentry or %NULL pointer increment the reference count * if appropriate and return the dentry. A dentry will not be * destroyed when it has references. */ static inline struct dentry *dget_dlock(struct dentry *dentry) { if (dentry) dentry->d_lockref.count++; return dentry; } static inline struct dentry *dget(struct dentry *dentry) { if (dentry) lockref_get(&dentry->d_lockref); return dentry; } extern struct dentry *dget_parent(struct dentry *dentry); /** * d_unhashed - is dentry hashed * @dentry: entry to check * * Returns true if the dentry passed is not currently hashed. */ static inline int d_unhashed(const struct dentry *dentry) { return hlist_bl_unhashed(&dentry->d_hash); } static inline int d_unlinked(const struct dentry *dentry) { return d_unhashed(dentry) && !IS_ROOT(dentry); } static inline int cant_mount(const struct dentry *dentry) { return (dentry->d_flags & DCACHE_CANT_MOUNT); } static inline void dont_mount(struct dentry *dentry) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_CANT_MOUNT; spin_unlock(&dentry->d_lock); } extern void __d_lookup_done(struct dentry *); static inline int d_in_lookup(const struct dentry *dentry) { return dentry->d_flags & DCACHE_PAR_LOOKUP; } static inline void d_lookup_done(struct dentry *dentry) { if (unlikely(d_in_lookup(dentry))) { spin_lock(&dentry->d_lock); __d_lookup_done(dentry); spin_unlock(&dentry->d_lock); } } extern void dput(struct dentry *); static inline bool d_managed(const struct dentry *dentry) { return dentry->d_flags & DCACHE_MANAGED_DENTRY; } static inline bool d_mountpoint(const struct dentry *dentry) { return dentry->d_flags & DCACHE_MOUNTED; } /* * Directory cache entry type accessor functions. */ static inline unsigned __d_entry_type(const struct dentry *dentry) { return dentry->d_flags & DCACHE_ENTRY_TYPE; } static inline bool d_is_miss(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_MISS_TYPE; } static inline bool d_is_whiteout(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_WHITEOUT_TYPE; } static inline bool d_can_lookup(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_DIRECTORY_TYPE; } static inline bool d_is_autodir(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_AUTODIR_TYPE; } static inline bool d_is_dir(const struct dentry *dentry) { return d_can_lookup(dentry) || d_is_autodir(dentry); } static inline bool d_is_symlink(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_SYMLINK_TYPE; } static inline bool d_is_reg(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_REGULAR_TYPE; } static inline bool d_is_special(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_SPECIAL_TYPE; } static inline bool d_is_file(const struct dentry *dentry) { return d_is_reg(dentry) || d_is_special(dentry); } static inline bool d_is_negative(const struct dentry *dentry) { // TODO: check d_is_whiteout(dentry) also. return d_is_miss(dentry); } static inline bool d_flags_negative(unsigned flags) { return (flags & DCACHE_ENTRY_TYPE) == DCACHE_MISS_TYPE; } static inline bool d_is_positive(const struct dentry *dentry) { return !d_is_negative(dentry); } /** * d_really_is_negative - Determine if a dentry is really negative (ignoring fallthroughs) * @dentry: The dentry in question * * Returns true if the dentry represents either an absent name or a name that * doesn't map to an inode (ie. ->d_inode is NULL). The dentry could represent * a true miss, a whiteout that isn't represented by a 0,0 chardev or a * fallthrough marker in an opaque directory. * * Note! (1) This should be used *only* by a filesystem to examine its own * dentries. It should not be used to look at some other filesystem's * dentries. (2) It should also be used in combination with d_inode() to get * the inode. (3) The dentry may have something attached to ->d_lower and the * type field of the flags may be set to something other than miss or whiteout. */ static inline bool d_really_is_negative(const struct dentry *dentry) { return dentry->d_inode == NULL; } /** * d_really_is_positive - Determine if a dentry is really positive (ignoring fallthroughs) * @dentry: The dentry in question * * Returns true if the dentry represents a name that maps to an inode * (ie. ->d_inode is not NULL). The dentry might still represent a whiteout if * that is represented on medium as a 0,0 chardev. * * Note! (1) This should be used *only* by a filesystem to examine its own * dentries. It should not be used to look at some other filesystem's * dentries. (2) It should also be used in combination with d_inode() to get * the inode. */ static inline bool d_really_is_positive(const struct dentry *dentry) { return dentry->d_inode != NULL; } static inline int simple_positive(const struct dentry *dentry) { return d_really_is_positive(dentry) && !d_unhashed(dentry); } extern void d_set_fallthru(struct dentry *dentry); static inline bool d_is_fallthru(const struct dentry *dentry) { return dentry->d_flags & DCACHE_FALLTHRU; } extern int sysctl_vfs_cache_pressure; static inline unsigned long vfs_pressure_ratio(unsigned long val) { return mult_frac(val, sysctl_vfs_cache_pressure, 100); } /** * d_inode - Get the actual inode of this dentry * @dentry: The dentry to query * * This is the helper normal filesystems should use to get at their own inodes * in their own dentries and ignore the layering superimposed upon them. */ static inline struct inode *d_inode(const struct dentry *dentry) { return dentry->d_inode; } /** * d_inode_rcu - Get the actual inode of this dentry with READ_ONCE() * @dentry: The dentry to query * * This is the helper normal filesystems should use to get at their own inodes * in their own dentries and ignore the layering superimposed upon them. */ static inline struct inode *d_inode_rcu(const struct dentry *dentry) { return READ_ONCE(dentry->d_inode); } /** * d_backing_inode - Get upper or lower inode we should be using * @upper: The upper layer * * This is the helper that should be used to get at the inode that will be used * if this dentry were to be opened as a file. The inode may be on the upper * dentry or it may be on a lower dentry pinned by the upper. * * Normal filesystems should not use this to access their own inodes. */ static inline struct inode *d_backing_inode(const struct dentry *upper) { struct inode *inode = upper->d_inode; return inode; } /** * d_backing_dentry - Get upper or lower dentry we should be using * @upper: The upper layer * * This is the helper that should be used to get the dentry of the inode that * will be used if this dentry were opened as a file. It may be the upper * dentry or it may be a lower dentry pinned by the upper. * * Normal filesystems should not use this to access their own dentries. */ static inline struct dentry *d_backing_dentry(struct dentry *upper) { return upper; } /** * d_real - Return the real dentry * @dentry: the dentry to query * @inode: inode to select the dentry from multiple layers (can be NULL) * * If dentry is on a union/overlay, then return the underlying, real dentry. * Otherwise return the dentry itself. * * See also: Documentation/filesystems/vfs.rst */ static inline struct dentry *d_real(struct dentry *dentry, const struct inode *inode) { if (unlikely(dentry->d_flags & DCACHE_OP_REAL)) return dentry->d_op->d_real(dentry, inode); else return dentry; } /** * d_real_inode - Return the real inode * @dentry: The dentry to query * * If dentry is on a union/overlay, then return the underlying, real inode. * Otherwise return d_inode(). */ static inline struct inode *d_real_inode(const struct dentry *dentry) { /* This usage of d_real() results in const dentry */ return d_backing_inode(d_real((struct dentry *) dentry, NULL)); } struct name_snapshot { struct qstr name; unsigned char inline_name[DNAME_INLINE_LEN]; }; void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *); void release_dentry_name_snapshot(struct name_snapshot *); #endif /* __LINUX_DCACHE_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2020 ARM Ltd. */ #ifndef __ASM_VDSO_PROCESSOR_H #define __ASM_VDSO_PROCESSOR_H #ifndef __ASSEMBLY__ /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ static __always_inline void rep_nop(void) { asm volatile("rep; nop" ::: "memory"); } static __always_inline void cpu_relax(void) { rep_nop(); } #endif /* __ASSEMBLY__ */ #endif /* __ASM_VDSO_PROCESSOR_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for inet_sock * * Authors: Many, reorganised here by * Arnaldo Carvalho de Melo <acme@mandriva.com> */ #ifndef _INET_SOCK_H #define _INET_SOCK_H #include <linux/bitops.h> #include <linux/string.h> #include <linux/types.h> #include <linux/jhash.h> #include <linux/netdevice.h> #include <net/flow.h> #include <net/sock.h> #include <net/request_sock.h> #include <net/netns/hash.h> #include <net/tcp_states.h> #include <net/l3mdev.h> /** struct ip_options - IP Options * * @faddr - Saved first hop address * @nexthop - Saved nexthop address in LSRR and SSRR * @is_strictroute - Strict source route * @srr_is_hit - Packet destination addr was our one * @is_changed - IP checksum more not valid * @rr_needaddr - Need to record addr of outgoing dev * @ts_needtime - Need to record timestamp * @ts_needaddr - Need to record addr of outgoing dev */ struct ip_options { __be32 faddr; __be32 nexthop; unsigned char optlen; unsigned char srr; unsigned char rr; unsigned char ts; unsigned char is_strictroute:1, srr_is_hit:1, is_changed:1, rr_needaddr:1, ts_needtime:1, ts_needaddr:1; unsigned char router_alert; unsigned char cipso; unsigned char __pad2; unsigned char __data[]; }; struct ip_options_rcu { struct rcu_head rcu; struct ip_options opt; }; struct ip_options_data { struct ip_options_rcu opt; char data[40]; }; struct inet_request_sock { struct request_sock req; #define ir_loc_addr req.__req_common.skc_rcv_saddr #define ir_rmt_addr req.__req_common.skc_daddr #define ir_num req.__req_common.skc_num #define ir_rmt_port req.__req_common.skc_dport #define ir_v6_rmt_addr req.__req_common.skc_v6_daddr #define ir_v6_loc_addr req.__req_common.skc_v6_rcv_saddr #define ir_iif req.__req_common.skc_bound_dev_if #define ir_cookie req.__req_common.skc_cookie #define ireq_net req.__req_common.skc_net #define ireq_state req.__req_common.skc_state #define ireq_family req.__req_common.skc_family u16 snd_wscale : 4, rcv_wscale : 4, tstamp_ok : 1, sack_ok : 1, wscale_ok : 1, ecn_ok : 1, acked : 1, no_srccheck: 1, smc_ok : 1; u32 ir_mark; union { struct ip_options_rcu __rcu *ireq_opt; #if IS_ENABLED(CONFIG_IPV6) struct { struct ipv6_txoptions *ipv6_opt; struct sk_buff *pktopts; }; #endif }; }; static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) { return (struct inet_request_sock *)sk; } static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb) { if (!sk->sk_mark && sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept) return skb->mark; return sk->sk_mark; } static inline int inet_request_bound_dev_if(const struct sock *sk, struct sk_buff *skb) { #ifdef CONFIG_NET_L3_MASTER_DEV struct net *net = sock_net(sk); if (!sk->sk_bound_dev_if && net->ipv4.sysctl_tcp_l3mdev_accept) return l3mdev_master_ifindex_by_index(net, skb->skb_iif); #endif return sk->sk_bound_dev_if; } static inline int inet_sk_bound_l3mdev(const struct sock *sk) { #ifdef CONFIG_NET_L3_MASTER_DEV struct net *net = sock_net(sk); if (!net->ipv4.sysctl_tcp_l3mdev_accept) return l3mdev_master_ifindex_by_index(net, sk->sk_bound_dev_if); #endif return 0; } static inline bool inet_bound_dev_eq(bool l3mdev_accept, int bound_dev_if, int dif, int sdif) { if (!bound_dev_if) return !sdif || l3mdev_accept; return bound_dev_if == dif || bound_dev_if == sdif; } struct inet_cork { unsigned int flags; __be32 addr; struct ip_options *opt; unsigned int fragsize; int length; /* Total length of all frames */ struct dst_entry *dst; u8 tx_flags; __u8 ttl; __s16 tos; char priority; __u16 gso_size; u64 transmit_time; u32 mark; }; struct inet_cork_full { struct inet_cork base; struct flowi fl; }; struct ip_mc_socklist; struct ipv6_pinfo; struct rtable; /** struct inet_sock - representation of INET sockets * * @sk - ancestor class * @pinet6 - pointer to IPv6 control block * @inet_daddr - Foreign IPv4 addr * @inet_rcv_saddr - Bound local IPv4 addr * @inet_dport - Destination port * @inet_num - Local port * @inet_saddr - Sending source * @uc_ttl - Unicast TTL * @inet_sport - Source port * @inet_id - ID counter for DF pkts * @tos - TOS * @mc_ttl - Multicasting TTL * @is_icsk - is this an inet_connection_sock? * @uc_index - Unicast outgoing device index * @mc_index - Multicast device index * @mc_list - Group array * @cork - info to build ip hdr on each ip frag while socket is corked */ struct inet_sock { /* sk and pinet6 has to be the first two members of inet_sock */ struct sock sk; #if IS_ENABLED(CONFIG_IPV6) struct ipv6_pinfo *pinet6; #endif /* Socket demultiplex comparisons on incoming packets. */ #define inet_daddr sk.__sk_common.skc_daddr #define inet_rcv_saddr sk.__sk_common.skc_rcv_saddr #define inet_dport sk.__sk_common.skc_dport #define inet_num sk.__sk_common.skc_num __be32 inet_saddr; __s16 uc_ttl; __u16 cmsg_flags; __be16 inet_sport; __u16 inet_id; struct ip_options_rcu __rcu *inet_opt; int rx_dst_ifindex; __u8 tos; __u8 min_ttl; __u8 mc_ttl; __u8 pmtudisc; __u8 recverr:1, is_icsk:1, freebind:1, hdrincl:1, mc_loop:1, transparent:1, mc_all:1, nodefrag:1; __u8 bind_address_no_port:1, recverr_rfc4884:1, defer_connect:1; /* Indicates that fastopen_connect is set * and cookie exists so we defer connect * until first data frame is written */ __u8 rcv_tos; __u8 convert_csum; int uc_index; int mc_index; __be32 mc_addr; struct ip_mc_socklist __rcu *mc_list; struct inet_cork_full cork; }; #define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ #define IPCORK_ALLFRAG 2 /* always fragment (for ipv6 for now) */ /* cmsg flags for inet */ #define IP_CMSG_PKTINFO BIT(0) #define IP_CMSG_TTL BIT(1) #define IP_CMSG_TOS BIT(2) #define IP_CMSG_RECVOPTS BIT(3) #define IP_CMSG_RETOPTS BIT(4) #define IP_CMSG_PASSSEC BIT(5) #define IP_CMSG_ORIGDSTADDR BIT(6) #define IP_CMSG_CHECKSUM BIT(7) #define IP_CMSG_RECVFRAGSIZE BIT(8) /** * sk_to_full_sk - Access to a full socket * @sk: pointer to a socket * * SYNACK messages might be attached to request sockets. * Some places want to reach the listener in this case. */ static inline struct sock *sk_to_full_sk(struct sock *sk) { #ifdef CONFIG_INET if (sk && sk->sk_state == TCP_NEW_SYN_RECV) sk = inet_reqsk(sk)->rsk_listener; #endif return sk; } /* sk_to_full_sk() variant with a const argument */ static inline const struct sock *sk_const_to_full_sk(const struct sock *sk) { #ifdef CONFIG_INET if (sk && sk->sk_state == TCP_NEW_SYN_RECV) sk = ((const struct request_sock *)sk)->rsk_listener; #endif return sk; } static inline struct sock *skb_to_full_sk(const struct sk_buff *skb) { return sk_to_full_sk(skb->sk); } static inline struct inet_sock *inet_sk(const struct sock *sk) { return (struct inet_sock *)sk; } static inline void __inet_sk_copy_descendant(struct sock *sk_to, const struct sock *sk_from, const int ancestor_size) { memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1, sk_from->sk_prot->obj_size - ancestor_size); } int inet_sk_rebuild_header(struct sock *sk); /** * inet_sk_state_load - read sk->sk_state for lockless contexts * @sk: socket pointer * * Paired with inet_sk_state_store(). Used in places we don't hold socket lock: * tcp_diag_get_info(), tcp_get_info(), tcp_poll(), get_tcp4_sock() ... */ static inline int inet_sk_state_load(const struct sock *sk) { /* state change might impact lockless readers. */ return smp_load_acquire(&sk->sk_state); } /** * inet_sk_state_store - update sk->sk_state * @sk: socket pointer * @newstate: new state * * Paired with inet_sk_state_load(). Should be used in contexts where * state change might impact lockless readers. */ void inet_sk_state_store(struct sock *sk, int newstate); void inet_sk_set_state(struct sock *sk, int state); static inline unsigned int __inet_ehashfn(const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport, u32 initval) { return jhash_3words((__force __u32) laddr, (__force __u32) faddr, ((__u32) lport) << 16 | (__force __u32)fport, initval); } struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener); static inline __u8 inet_sk_flowi_flags(const struct sock *sk) { __u8 flags = 0; if (inet_sk(sk)->transparent || inet_sk(sk)->hdrincl) flags |= FLOWI_FLAG_ANYSRC; return flags; } static inline void inet_inc_convert_csum(struct sock *sk) { inet_sk(sk)->convert_csum++; } static inline void inet_dec_convert_csum(struct sock *sk) { if (inet_sk(sk)->convert_csum > 0) inet_sk(sk)->convert_csum--; } static inline bool inet_get_convert_csum(struct sock *sk) { return !!inet_sk(sk)->convert_csum; } static inline bool inet_can_nonlocal_bind(struct net *net, struct inet_sock *inet) { return net->ipv4.sysctl_ip_nonlocal_bind || inet->freebind || inet->transparent; } #endif /* _INET_SOCK_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM oom #if !defined(_TRACE_OOM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_OOM_H #include <linux/tracepoint.h> #include <trace/events/mmflags.h> TRACE_EVENT(oom_score_adj_update, TP_PROTO(struct task_struct *task), TP_ARGS(task), TP_STRUCT__entry( __field( pid_t, pid) __array( char, comm, TASK_COMM_LEN ) __field( short, oom_score_adj) ), TP_fast_assign( __entry->pid = task->pid; memcpy(__entry->comm, task->comm, TASK_COMM_LEN); __entry->oom_score_adj = task->signal->oom_score_adj; ), TP_printk("pid=%d comm=%s oom_score_adj=%hd", __entry->pid, __entry->comm, __entry->oom_score_adj) ); TRACE_EVENT(reclaim_retry_zone, TP_PROTO(struct zoneref *zoneref, int order, unsigned long reclaimable, unsigned long available, unsigned long min_wmark, int no_progress_loops, bool wmark_check), TP_ARGS(zoneref, order, reclaimable, available, min_wmark, no_progress_loops, wmark_check), TP_STRUCT__entry( __field( int, node) __field( int, zone_idx) __field( int, order) __field( unsigned long, reclaimable) __field( unsigned long, available) __field( unsigned long, min_wmark) __field( int, no_progress_loops) __field( bool, wmark_check) ), TP_fast_assign( __entry->node = zone_to_nid(zoneref->zone); __entry->zone_idx = zoneref->zone_idx; __entry->order = order; __entry->reclaimable = reclaimable; __entry->available = available; __entry->min_wmark = min_wmark; __entry->no_progress_loops = no_progress_loops; __entry->wmark_check = wmark_check; ), TP_printk("node=%d zone=%-8s order=%d reclaimable=%lu available=%lu min_wmark=%lu no_progress_loops=%d wmark_check=%d", __entry->node, __print_symbolic(__entry->zone_idx, ZONE_TYPE), __entry->order, __entry->reclaimable, __entry->available, __entry->min_wmark, __entry->no_progress_loops, __entry->wmark_check) ); TRACE_EVENT(mark_victim, TP_PROTO(int pid), TP_ARGS(pid), TP_STRUCT__entry( __field(int, pid) ), TP_fast_assign( __entry->pid = pid; ), TP_printk("pid=%d", __entry->pid) ); TRACE_EVENT(wake_reaper, TP_PROTO(int pid), TP_ARGS(pid), TP_STRUCT__entry( __field(int, pid) ), TP_fast_assign( __entry->pid = pid; ), TP_printk("pid=%d", __entry->pid) ); TRACE_EVENT(start_task_reaping, TP_PROTO(int pid), TP_ARGS(pid), TP_STRUCT__entry( __field(int, pid) ), TP_fast_assign( __entry->pid = pid; ), TP_printk("pid=%d", __entry->pid) ); TRACE_EVENT(finish_task_reaping, TP_PROTO(int pid), TP_ARGS(pid), TP_STRUCT__entry( __field(int, pid) ), TP_fast_assign( __entry->pid = pid; ), TP_printk("pid=%d", __entry->pid) ); TRACE_EVENT(skip_task_reaping, TP_PROTO(int pid), TP_ARGS(pid), TP_STRUCT__entry( __field(int, pid) ), TP_fast_assign( __entry->pid = pid; ), TP_printk("pid=%d", __entry->pid) ); #ifdef CONFIG_COMPACTION TRACE_EVENT(compact_retry, TP_PROTO(int order, enum compact_priority priority, enum compact_result result, int retries, int max_retries, bool ret), TP_ARGS(order, priority, result, retries, max_retries, ret), TP_STRUCT__entry( __field( int, order) __field( int, priority) __field( int, result) __field( int, retries) __field( int, max_retries) __field( bool, ret) ), TP_fast_assign( __entry->order = order; __entry->priority = priority; __entry->result = compact_result_to_feedback(result); __entry->retries = retries; __entry->max_retries = max_retries; __entry->ret = ret; ), TP_printk("order=%d priority=%s compaction_result=%s retries=%d max_retries=%d should_retry=%d", __entry->order, __print_symbolic(__entry->priority, COMPACTION_PRIORITY), __print_symbolic(__entry->result, COMPACTION_FEEDBACK), __entry->retries, __entry->max_retries, __entry->ret) ); #endif /* CONFIG_COMPACTION */ #endif /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef PM_TRACE_H #define PM_TRACE_H #include <linux/types.h> #ifdef CONFIG_PM_TRACE #include <asm/pm-trace.h> extern int pm_trace_enabled; extern bool pm_trace_rtc_abused; static inline bool pm_trace_rtc_valid(void) { return !pm_trace_rtc_abused; } static inline int pm_trace_is_enabled(void) { return pm_trace_enabled; } struct device; extern void set_trace_device(struct device *); extern void generate_pm_trace(const void *tracedata, unsigned int user); extern int show_trace_dev_match(char *buf, size_t size); #define TRACE_DEVICE(dev) do { \ if (pm_trace_enabled) \ set_trace_device(dev); \ } while(0) #else static inline bool pm_trace_rtc_valid(void) { return true; } static inline int pm_trace_is_enabled(void) { return 0; } #define TRACE_DEVICE(dev) do { } while (0) #define TRACE_RESUME(dev) do { } while (0) #define TRACE_SUSPEND(dev) do { } while (0) #endif #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 // SPDX-License-Identifier: GPL-2.0+ /* * ext4_jbd2.h * * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 * * Copyright 1998--1999 Red Hat corp --- All Rights Reserved * * Ext4-specific journaling extensions. */ #ifndef _EXT4_JBD2_H #define _EXT4_JBD2_H #include <linux/fs.h> #include <linux/jbd2.h> #include "ext4.h" #define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal) /* Define the number of blocks we need to account to a transaction to * modify one block of data. * * We may have to touch one inode, one bitmap buffer, up to three * indirection blocks, the group and superblock summaries, and the data * block to complete the transaction. * * For extents-enabled fs we may have to allocate and modify up to * 5 levels of tree, data block (for each of these we need bitmap + group * summaries), root which is stored in the inode, sb */ #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ (ext4_has_feature_extents(sb) ? 20U : 8U) /* Extended attribute operations touch at most two data buffers, * two bitmap buffers, and two group summaries, in addition to the inode * and the superblock, which are already accounted for. */ #define EXT4_XATTR_TRANS_BLOCKS 6U /* Define the minimum size for a transaction which modifies data. This * needs to take into account the fact that we may end up modifying two * quota files too (one for the group, one for the user quota). The * superblock only gets updated once, of course, so don't bother * counting that again for the quota updates. */ #define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ EXT4_XATTR_TRANS_BLOCKS - 2 + \ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) /* * Define the number of metadata blocks we need to account to modify data. * * This include super block, inode block, quota blocks and xattr blocks */ #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) /* Define an arbitrary limit for the amount of data we will anticipate * writing to any given transaction. For unbounded transactions such as * write(2) and truncate(2) we can write more than this, but we always * start off at the maximum transaction size and grow the transaction * optimistically as we go. */ #define EXT4_MAX_TRANS_DATA 64U /* We break up a large truncate or write transaction once the handle's * buffer credits gets this low, we need either to extend the * transaction or to start a new one. Reserve enough space here for * inode, bitmap, superblock, group and indirection updates for at least * one block, plus two quota updates. Quota allocations are not * needed. */ #define EXT4_RESERVE_TRANS_BLOCKS 12U /* * Number of credits needed if we need to insert an entry into a * directory. For each new index block, we need 4 blocks (old index * block, new index block, bitmap block, bg summary). For normal * htree directories there are 2 levels; if the largedir feature * enabled it's 3 levels. */ #define EXT4_INDEX_EXTRA_TRANS_BLOCKS 12U #ifdef CONFIG_QUOTA /* Amount of blocks needed for quota update - we know that the structure was * allocated so we need to update only data block */ #define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ ext4_has_feature_quota(sb)) ? 1 : 0) /* Amount of blocks needed for quota insert/delete - we do some block writes * but inode, sb and group updates are done only once */ #define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ ext4_has_feature_quota(sb)) ?\ (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ +3+DQUOT_INIT_REWRITE) : 0) #define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ ext4_has_feature_quota(sb)) ?\ (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ +3+DQUOT_DEL_REWRITE) : 0) #else #define EXT4_QUOTA_TRANS_BLOCKS(sb) 0 #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 #endif #define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) /* * Ext4 handle operation types -- for logging purposes */ #define EXT4_HT_MISC 0 #define EXT4_HT_INODE 1 #define EXT4_HT_WRITE_PAGE 2 #define EXT4_HT_MAP_BLOCKS 3 #define EXT4_HT_DIR 4 #define EXT4_HT_TRUNCATE 5 #define EXT4_HT_QUOTA 6 #define EXT4_HT_RESIZE 7 #define EXT4_HT_MIGRATE 8 #define EXT4_HT_MOVE_EXTENTS 9 #define EXT4_HT_XATTR 10 #define EXT4_HT_EXT_CONVERT 11 #define EXT4_HT_MAX 12 /** * struct ext4_journal_cb_entry - Base structure for callback information. * * This struct is a 'seed' structure for a using with your own callback * structs. If you are using callbacks you must allocate one of these * or another struct of your own definition which has this struct * as it's first element and pass it to ext4_journal_callback_add(). */ struct ext4_journal_cb_entry { /* list information for other callbacks attached to the same handle */ struct list_head jce_list; /* Function to call with this callback structure */ void (*jce_func)(struct super_block *sb, struct ext4_journal_cb_entry *jce, int error); /* user data goes here */ }; /** * ext4_journal_callback_add: add a function to call after transaction commit * @handle: active journal transaction handle to register callback on * @func: callback function to call after the transaction has committed: * @sb: superblock of current filesystem for transaction * @jce: returned journal callback data * @rc: journal state at commit (0 = transaction committed properly) * @jce: journal callback data (internal and function private data struct) * * The registered function will be called in the context of the journal thread * after the transaction for which the handle was created has completed. * * No locks are held when the callback function is called, so it is safe to * call blocking functions from within the callback, but the callback should * not block or run for too long, or the filesystem will be blocked waiting for * the next transaction to commit. No journaling functions can be used, or * there is a risk of deadlock. * * There is no guaranteed calling order of multiple registered callbacks on * the same transaction. */ static inline void _ext4_journal_callback_add(handle_t *handle, struct ext4_journal_cb_entry *jce) { /* Add the jce to transaction's private list */ list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); } static inline void ext4_journal_callback_add(handle_t *handle, void (*func)(struct super_block *sb, struct ext4_journal_cb_entry *jce, int rc), struct ext4_journal_cb_entry *jce) { struct ext4_sb_info *sbi = EXT4_SB(handle->h_transaction->t_journal->j_private); /* Add the jce to transaction's private list */ jce->jce_func = func; spin_lock(&sbi->s_md_lock); _ext4_journal_callback_add(handle, jce); spin_unlock(&sbi->s_md_lock); } /** * ext4_journal_callback_del: delete a registered callback * @handle: active journal transaction handle on which callback was registered * @jce: registered journal callback entry to unregister * Return true if object was successfully removed */ static inline bool ext4_journal_callback_try_del(handle_t *handle, struct ext4_journal_cb_entry *jce) { bool deleted; struct ext4_sb_info *sbi = EXT4_SB(handle->h_transaction->t_journal->j_private); spin_lock(&sbi->s_md_lock); deleted = !list_empty(&jce->jce_list); list_del_init(&jce->jce_list); spin_unlock(&sbi->s_md_lock); return deleted; } int ext4_mark_iloc_dirty(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc); /* * On success, We end up with an outstanding reference count against * iloc->bh. This _must_ be cleaned up later. */ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc); #define ext4_mark_inode_dirty(__h, __i) \ __ext4_mark_inode_dirty((__h), (__i), __func__, __LINE__) int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode, const char *func, unsigned int line); int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc *iloc); /* * Wrapper functions with which ext4 calls into JBD. */ int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh); int __ext4_forget(const char *where, unsigned int line, handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr); int __ext4_journal_get_create_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh); int __ext4_handle_dirty_metadata(const char *where, unsigned int line, handle_t *handle, struct inode *inode, struct buffer_head *bh); int __ext4_handle_dirty_super(const char *where, unsigned int line, handle_t *handle, struct super_block *sb); #define ext4_journal_get_write_access(handle, bh) \ __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \ (bh), (block_nr)) #define ext4_journal_get_create_access(handle, bh) \ __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh)) #define ext4_handle_dirty_metadata(handle, inode, bh) \ __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ (bh)) #define ext4_handle_dirty_super(handle, sb) \ __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, int type, int blocks, int rsv_blocks, int revoke_creds); int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) /* Note: Do not use this for NULL handles. This is only to determine if * a properly allocated handle is using a journal or not. */ static inline int ext4_handle_valid(handle_t *handle) { if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT) return 0; return 1; } static inline void ext4_handle_sync(handle_t *handle) { if (ext4_handle_valid(handle)) handle->h_sync = 1; } static inline int ext4_handle_is_aborted(handle_t *handle) { if (ext4_handle_valid(handle)) return is_handle_aborted(handle); return 0; } static inline int ext4_free_metadata_revoke_credits(struct super_block *sb, int blocks) { /* Freeing each metadata block can result in freeing one cluster */ return blocks * EXT4_SB(sb)->s_cluster_ratio; } static inline int ext4_trans_default_revoke_credits(struct super_block *sb) { return ext4_free_metadata_revoke_credits(sb, 8); } #define ext4_journal_start_sb(sb, type, nblocks) \ __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0, \ ext4_trans_default_revoke_credits(sb)) #define ext4_journal_start(inode, type, nblocks) \ __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0, \ ext4_trans_default_revoke_credits((inode)->i_sb)) #define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks)\ __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks),\ ext4_trans_default_revoke_credits((inode)->i_sb)) #define ext4_journal_start_with_revoke(inode, type, blocks, revoke_creds) \ __ext4_journal_start((inode), __LINE__, (type), (blocks), 0, \ (revoke_creds)) static inline handle_t *__ext4_journal_start(struct inode *inode, unsigned int line, int type, int blocks, int rsv_blocks, int revoke_creds) { return __ext4_journal_start_sb(inode->i_sb, line, type, blocks, rsv_blocks, revoke_creds); } #define ext4_journal_stop(handle) \ __ext4_journal_stop(__func__, __LINE__, (handle)) #define ext4_journal_start_reserved(handle, type) \ __ext4_journal_start_reserved((handle), __LINE__, (type)) handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, int type); static inline handle_t *ext4_journal_current_handle(void) { return journal_current_handle(); } static inline int ext4_journal_extend(handle_t *handle, int nblocks, int revoke) { if (ext4_handle_valid(handle)) return jbd2_journal_extend(handle, nblocks, revoke); return 0; } static inline int ext4_journal_restart(handle_t *handle, int nblocks, int revoke) { if (ext4_handle_valid(handle)) return jbd2__journal_restart(handle, nblocks, revoke, GFP_NOFS); return 0; } int __ext4_journal_ensure_credits(handle_t *handle, int check_cred, int extend_cred, int revoke_cred); /* * Ensure @handle has at least @check_creds credits available. If not, * transaction will be extended or restarted to contain at least @extend_cred * credits. Before restarting transaction @fn is executed to allow for cleanup * before the transaction is restarted. * * The return value is < 0 in case of error, 0 in case the handle has enough * credits or transaction extension succeeded, 1 in case transaction had to be * restarted. */ #define ext4_journal_ensure_credits_fn(handle, check_cred, extend_cred, \ revoke_cred, fn) \ ({ \ __label__ __ensure_end; \ int err = __ext4_journal_ensure_credits((handle), (check_cred), \ (extend_cred), (revoke_cred)); \ \ if (err <= 0) \ goto __ensure_end; \ err = (fn); \ if (err < 0) \ goto __ensure_end; \ err = ext4_journal_restart((handle), (extend_cred), (revoke_cred)); \ if (err == 0) \ err = 1; \ __ensure_end: \ err; \ }) /* * Ensure given handle has at least requested amount of credits available, * possibly restarting transaction if needed. We also make sure the transaction * has space for at least ext4_trans_default_revoke_credits(sb) revoke records * as freeing one or two blocks is very common pattern and requesting this is * very cheap. */ static inline int ext4_journal_ensure_credits(handle_t *handle, int credits, int revoke_creds) { return ext4_journal_ensure_credits_fn(handle, credits, credits, revoke_creds, 0); } static inline int ext4_journal_blocks_per_page(struct inode *inode) { if (EXT4_JOURNAL(inode) != NULL) return jbd2_journal_blocks_per_page(inode); return 0; } static inline int ext4_journal_force_commit(journal_t *journal) { if (journal) return jbd2_journal_force_commit(journal); return 0; } static inline int ext4_jbd2_inode_add_write(handle_t *handle, struct inode *inode, loff_t start_byte, loff_t length) { if (ext4_handle_valid(handle)) return jbd2_journal_inode_ranged_write(handle, EXT4_I(inode)->jinode, start_byte, length); return 0; } static inline int ext4_jbd2_inode_add_wait(handle_t *handle, struct inode *inode, loff_t start_byte, loff_t length) { if (ext4_handle_valid(handle)) return jbd2_journal_inode_ranged_wait(handle, EXT4_I(inode)->jinode, start_byte, length); return 0; } static inline void ext4_update_inode_fsync_trans(handle_t *handle, struct inode *inode, int datasync) { struct ext4_inode_info *ei = EXT4_I(inode); if (ext4_handle_valid(handle) && !is_handle_aborted(handle)) { ei->i_sync_tid = handle->h_transaction->t_tid; if (datasync) ei->i_datasync_tid = handle->h_transaction->t_tid; } } /* super.c */ int ext4_force_commit(struct super_block *sb); /* * Ext4 inode journal modes */ #define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */ #define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ #define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ int ext4_inode_journal_mode(struct inode *inode); static inline int ext4_should_journal_data(struct inode *inode) { return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE; } static inline int ext4_should_order_data(struct inode *inode) { return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE; } static inline int ext4_should_writeback_data(struct inode *inode) { return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; } static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks) { if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) return 0; if (!ext4_should_journal_data(inode)) return 0; /* * Data blocks in one extent are contiguous, just account for partial * clusters at extent boundaries */ return blocks + 2*(EXT4_SB(inode->i_sb)->s_cluster_ratio - 1); } /* * This function controls whether or not we should try to go down the * dioread_nolock code paths, which makes it safe to avoid taking * i_mutex for direct I/O reads. This only works for extent-based * files, and it doesn't work if data journaling is enabled, since the * dioread_nolock code uses b_private to pass information back to the * I/O completion handler, and this conflicts with the jbd's use of * b_private. */ static inline int ext4_should_dioread_nolock(struct inode *inode) { if (!test_opt(inode->i_sb, DIOREAD_NOLOCK)) return 0; if (!S_ISREG(inode->i_mode)) return 0; if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return 0; if (ext4_should_journal_data(inode)) return 0; /* temporary fix to prevent generic/422 test failures */ if (!test_opt(inode->i_sb, DELALLOC)) return 0; return 1; } #endif /* _EXT4_JBD2_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NAMEI_H #define _LINUX_NAMEI_H #include <linux/fs.h> #include <linux/kernel.h> #include <linux/path.h> #include <linux/fcntl.h> #include <linux/errno.h> enum { MAX_NESTED_LINKS = 8 }; #define MAXSYMLINKS 40 /* * Type of the last component on LOOKUP_PARENT */ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT}; /* pathwalk mode */ #define LOOKUP_FOLLOW 0x0001 /* follow links at the end */ #define LOOKUP_DIRECTORY 0x0002 /* require a directory */ #define LOOKUP_AUTOMOUNT 0x0004 /* force terminal automount */ #define LOOKUP_EMPTY 0x4000 /* accept empty path [user_... only] */ #define LOOKUP_DOWN 0x8000 /* follow mounts in the starting point */ #define LOOKUP_MOUNTPOINT 0x0080 /* follow mounts in the end */ #define LOOKUP_REVAL 0x0020 /* tell ->d_revalidate() to trust no cache */ #define LOOKUP_RCU 0x0040 /* RCU pathwalk mode; semi-internal */ /* These tell filesystem methods that we are dealing with the final component... */ #define LOOKUP_OPEN 0x0100 /* ... in open */ #define LOOKUP_CREATE 0x0200 /* ... in object creation */ #define LOOKUP_EXCL 0x0400 /* ... in exclusive creation */ #define LOOKUP_RENAME_TARGET 0x0800 /* ... in destination of rename() */ /* internal use only */ #define LOOKUP_PARENT 0x0010 #define LOOKUP_JUMPED 0x1000 #define LOOKUP_ROOT 0x2000 #define LOOKUP_ROOT_GRABBED 0x0008 /* Scoping flags for lookup. */ #define LOOKUP_NO_SYMLINKS 0x010000 /* No symlink crossing. */ #define LOOKUP_NO_MAGICLINKS 0x020000 /* No nd_jump_link() crossing. */ #define LOOKUP_NO_XDEV 0x040000 /* No mountpoint crossing. */ #define LOOKUP_BENEATH 0x080000 /* No escaping from starting point. */ #define LOOKUP_IN_ROOT 0x100000 /* Treat dirfd as fs root. */ /* LOOKUP_* flags which do scope-related checks based on the dirfd. */ #define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT) extern int path_pts(struct path *path); extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty); static inline int user_path_at(int dfd, const char __user *name, unsigned flags, struct path *path) { return user_path_at_empty(dfd, name, flags, path, NULL); } extern int kern_path(const char *, unsigned, struct path *); extern struct dentry *kern_path_create(int, const char *, struct path *, unsigned int); extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int); extern void done_path_create(struct path *, struct dentry *); extern struct dentry *kern_path_locked(const char *, struct path *); extern struct dentry *try_lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int); extern struct dentry *lookup_positive_unlocked(const char *, struct dentry *, int); extern int follow_down_one(struct path *); extern int follow_down(struct path *); extern int follow_up(struct path *); extern struct dentry *lock_rename(struct dentry *, struct dentry *); extern void unlock_rename(struct dentry *, struct dentry *); extern int __must_check nd_jump_link(struct path *path); static inline void nd_terminate_link(void *name, size_t len, size_t maxlen) { ((char *) name)[min(len, maxlen)] = '\0'; } /** * retry_estale - determine whether the caller should retry an operation * @error: the error that would currently be returned * @flags: flags being used for next lookup attempt * * Check to see if the error code was -ESTALE, and then determine whether * to retry the call based on whether "flags" already has LOOKUP_REVAL set. * * Returns true if the caller should try the operation again. */ static inline bool retry_estale(const long error, const unsigned int flags) { return error == -ESTALE && !(flags & LOOKUP_REVAL); } #endif /* _LINUX_NAMEI_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 /* SPDX-License-Identifier: GPL-2.0 */ /* * Generic nexthop implementation * * Copyright (c) 2017-19 Cumulus Networks * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com> */ #ifndef __LINUX_NEXTHOP_H #define __LINUX_NEXTHOP_H #include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/route.h> #include <linux/types.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/netlink.h> #define NEXTHOP_VALID_USER_FLAGS RTNH_F_ONLINK struct nexthop; struct nh_config { u32 nh_id; u8 nh_family; u8 nh_protocol; u8 nh_blackhole; u8 nh_fdb; u32 nh_flags; int nh_ifindex; struct net_device *dev; union { __be32 ipv4; struct in6_addr ipv6; } gw; struct nlattr *nh_grp; u16 nh_grp_type; struct nlattr *nh_encap; u16 nh_encap_type; u32 nlflags; struct nl_info nlinfo; }; struct nh_info { struct hlist_node dev_hash; /* entry on netns devhash */ struct nexthop *nh_parent; u8 family; bool reject_nh; bool fdb_nh; union { struct fib_nh_common fib_nhc; struct fib_nh fib_nh; struct fib6_nh fib6_nh; }; }; struct nh_grp_entry { struct nexthop *nh; u8 weight; atomic_t upper_bound; struct list_head nh_list; struct nexthop *nh_parent; /* nexthop of group with this entry */ }; struct nh_group { struct nh_group *spare; /* spare group for removals */ u16 num_nh; bool mpath; bool fdb_nh; bool has_v4; struct nh_grp_entry nh_entries[]; }; struct nexthop { struct rb_node rb_node; /* entry on netns rbtree */ struct list_head fi_list; /* v4 entries using nh */ struct list_head f6i_list; /* v6 entries using nh */ struct list_head fdb_list; /* fdb entries using this nh */ struct list_head grp_list; /* nh group entries using this nh */ struct net *net; u32 id; u8 protocol; /* app managing this nh */ u8 nh_flags; bool is_group; refcount_t refcnt; struct rcu_head rcu; union { struct nh_info __rcu *nh_info; struct nh_group __rcu *nh_grp; }; }; enum nexthop_event_type { NEXTHOP_EVENT_DEL }; int register_nexthop_notifier(struct net *net, struct notifier_block *nb); int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb); /* caller is holding rcu or rtnl; no reference taken to nexthop */ struct nexthop *nexthop_find_by_id(struct net *net, u32 id); void nexthop_free_rcu(struct rcu_head *head); static inline bool nexthop_get(struct nexthop *nh) { return refcount_inc_not_zero(&nh->refcnt); } static inline void nexthop_put(struct nexthop *nh) { if (refcount_dec_and_test(&nh->refcnt)) call_rcu(&nh->rcu, nexthop_free_rcu); } static inline bool nexthop_cmp(const struct nexthop *nh1, const struct nexthop *nh2) { return nh1 == nh2; } static inline bool nexthop_is_fdb(const struct nexthop *nh) { if (nh->is_group) { const struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); return nh_grp->fdb_nh; } else { const struct nh_info *nhi; nhi = rcu_dereference_rtnl(nh->nh_info); return nhi->fdb_nh; } } static inline bool nexthop_has_v4(const struct nexthop *nh) { if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); return nh_grp->has_v4; } return false; } static inline bool nexthop_is_multipath(const struct nexthop *nh) { if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); return nh_grp->mpath; } return false; } struct nexthop *nexthop_select_path(struct nexthop *nh, int hash); static inline unsigned int nexthop_num_path(const struct nexthop *nh) { unsigned int rc = 1; if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); if (nh_grp->mpath) rc = nh_grp->num_nh; } return rc; } static inline struct nexthop *nexthop_mpath_select(const struct nh_group *nhg, int nhsel) { /* for_nexthops macros in fib_semantics.c grabs a pointer to * the nexthop before checking nhsel */ if (nhsel >= nhg->num_nh) return NULL; return nhg->nh_entries[nhsel].nh; } static inline int nexthop_mpath_fill_node(struct sk_buff *skb, struct nexthop *nh, u8 rt_family) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); int i; for (i = 0; i < nhg->num_nh; i++) { struct nexthop *nhe = nhg->nh_entries[i].nh; struct nh_info *nhi = rcu_dereference_rtnl(nhe->nh_info); struct fib_nh_common *nhc = &nhi->fib_nhc; int weight = nhg->nh_entries[i].weight; if (fib_add_nexthop(skb, nhc, weight, rt_family, 0) < 0) return -EMSGSIZE; } return 0; } /* called with rcu lock */ static inline bool nexthop_is_blackhole(const struct nexthop *nh) { const struct nh_info *nhi; if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); if (nh_grp->num_nh > 1) return false; nh = nh_grp->nh_entries[0].nh; } nhi = rcu_dereference_rtnl(nh->nh_info); return nhi->reject_nh; } static inline void nexthop_path_fib_result(struct fib_result *res, int hash) { struct nh_info *nhi; struct nexthop *nh; nh = nexthop_select_path(res->fi->nh, hash); nhi = rcu_dereference(nh->nh_info); res->nhc = &nhi->fib_nhc; } /* called with rcu read lock or rtnl held */ static inline struct fib_nh_common *nexthop_fib_nhc(struct nexthop *nh, int nhsel) { struct nh_info *nhi; BUILD_BUG_ON(offsetof(struct fib_nh, nh_common) != 0); BUILD_BUG_ON(offsetof(struct fib6_nh, nh_common) != 0); if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); if (nh_grp->mpath) { nh = nexthop_mpath_select(nh_grp, nhsel); if (!nh) return NULL; } } nhi = rcu_dereference_rtnl(nh->nh_info); return &nhi->fib_nhc; } /* called from fib_table_lookup with rcu_lock */ static inline struct fib_nh_common *nexthop_get_nhc_lookup(const struct nexthop *nh, int fib_flags, const struct flowi4 *flp, int *nhsel) { struct nh_info *nhi; if (nh->is_group) { struct nh_group *nhg = rcu_dereference(nh->nh_grp); int i; for (i = 0; i < nhg->num_nh; i++) { struct nexthop *nhe = nhg->nh_entries[i].nh; nhi = rcu_dereference(nhe->nh_info); if (fib_lookup_good_nhc(&nhi->fib_nhc, fib_flags, flp)) { *nhsel = i; return &nhi->fib_nhc; } } } else { nhi = rcu_dereference(nh->nh_info); if (fib_lookup_good_nhc(&nhi->fib_nhc, fib_flags, flp)) { *nhsel = 0; return &nhi->fib_nhc; } } return NULL; } static inline bool nexthop_uses_dev(const struct nexthop *nh, const struct net_device *dev) { struct nh_info *nhi; if (nh->is_group) { struct nh_group *nhg = rcu_dereference(nh->nh_grp); int i; for (i = 0; i < nhg->num_nh; i++) { struct nexthop *nhe = nhg->nh_entries[i].nh; nhi = rcu_dereference(nhe->nh_info); if (nhc_l3mdev_matches_dev(&nhi->fib_nhc, dev)) return true; } } else { nhi = rcu_dereference(nh->nh_info); if (nhc_l3mdev_matches_dev(&nhi->fib_nhc, dev)) return true; } return false; } static inline unsigned int fib_info_num_path(const struct fib_info *fi) { if (unlikely(fi->nh)) return nexthop_num_path(fi->nh); return fi->fib_nhs; } int fib_check_nexthop(struct nexthop *nh, u8 scope, struct netlink_ext_ack *extack); static inline struct fib_nh_common *fib_info_nhc(struct fib_info *fi, int nhsel) { if (unlikely(fi->nh)) return nexthop_fib_nhc(fi->nh, nhsel); return &fi->fib_nh[nhsel].nh_common; } /* only used when fib_nh is built into fib_info */ static inline struct fib_nh *fib_info_nh(struct fib_info *fi, int nhsel) { WARN_ON(fi->nh); return &fi->fib_nh[nhsel]; } /* * IPv6 variants */ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, struct netlink_ext_ack *extack); /* Caller should either hold rcu_read_lock(), or RTNL. */ static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh) { struct nh_info *nhi; if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); nh = nexthop_mpath_select(nh_grp, 0); if (!nh) return NULL; } nhi = rcu_dereference_rtnl(nh->nh_info); if (nhi->family == AF_INET6) return &nhi->fib6_nh; return NULL; } /* Variant of nexthop_fib6_nh(). * Caller should either hold rcu_read_lock_bh(), or RTNL. */ static inline struct fib6_nh *nexthop_fib6_nh_bh(struct nexthop *nh) { struct nh_info *nhi; if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_bh_rtnl(nh->nh_grp); nh = nexthop_mpath_select(nh_grp, 0); if (!nh) return NULL; } nhi = rcu_dereference_bh_rtnl(nh->nh_info); if (nhi->family == AF_INET6) return &nhi->fib6_nh; return NULL; } static inline struct net_device *fib6_info_nh_dev(struct fib6_info *f6i) { struct fib6_nh *fib6_nh; fib6_nh = f6i->nh ? nexthop_fib6_nh(f6i->nh) : f6i->fib6_nh; return fib6_nh->fib_nh_dev; } static inline void nexthop_path_fib6_result(struct fib6_result *res, int hash) { struct nexthop *nh = res->f6i->nh; struct nh_info *nhi; nh = nexthop_select_path(nh, hash); nhi = rcu_dereference_rtnl(nh->nh_info); if (nhi->reject_nh) { res->fib6_type = RTN_BLACKHOLE; res->fib6_flags |= RTF_REJECT; res->nh = nexthop_fib6_nh(nh); } else { res->nh = &nhi->fib6_nh; } } int nexthop_for_each_fib6_nh(struct nexthop *nh, int (*cb)(struct fib6_nh *nh, void *arg), void *arg); static inline int nexthop_get_family(struct nexthop *nh) { struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info); return nhi->family; } static inline struct fib_nh_common *nexthop_fdb_nhc(struct nexthop *nh) { struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info); return &nhi->fib_nhc; } static inline struct fib_nh_common *nexthop_path_fdb_result(struct nexthop *nh, int hash) { struct nh_info *nhi; struct nexthop *nhp; nhp = nexthop_select_path(nh, hash); if (unlikely(!nhp)) return NULL; nhi = rcu_dereference(nhp->nh_info); return &nhi->fib_nhc; } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 /* SPDX-License-Identifier: GPL-2.0 */ /* * Released under the GPLv2 only. */ #include <linux/pm.h> #include <linux/acpi.h> struct usb_hub_descriptor; struct usb_dev_state; /* Functions local to drivers/usb/core/ */ extern int usb_create_sysfs_dev_files(struct usb_device *dev); extern void usb_remove_sysfs_dev_files(struct usb_device *dev); extern void usb_create_sysfs_intf_files(struct usb_interface *intf); extern void usb_remove_sysfs_intf_files(struct usb_interface *intf); extern int usb_create_ep_devs(struct device *parent, struct usb_host_endpoint *endpoint, struct usb_device *udev); extern void usb_remove_ep_devs(struct usb_host_endpoint *endpoint); extern void usb_enable_endpoint(struct usb_device *dev, struct usb_host_endpoint *ep, bool reset_toggle); extern void usb_enable_interface(struct usb_device *dev, struct usb_interface *intf, bool reset_toggles); extern void usb_disable_endpoint(struct usb_device *dev, unsigned int epaddr, bool reset_hardware); extern void usb_disable_interface(struct usb_device *dev, struct usb_interface *intf, bool reset_hardware); extern void usb_release_interface_cache(struct kref *ref); extern void usb_disable_device(struct usb_device *dev, int skip_ep0); extern int usb_deauthorize_device(struct usb_device *); extern int usb_authorize_device(struct usb_device *); extern void usb_deauthorize_interface(struct usb_interface *); extern void usb_authorize_interface(struct usb_interface *); extern void usb_detect_quirks(struct usb_device *udev); extern void usb_detect_interface_quirks(struct usb_device *udev); extern void usb_release_quirk_list(void); extern bool usb_endpoint_is_ignored(struct usb_device *udev, struct usb_host_interface *intf, struct usb_endpoint_descriptor *epd); extern int usb_remove_device(struct usb_device *udev); extern int usb_get_device_descriptor(struct usb_device *dev, unsigned int size); extern int usb_set_isoch_delay(struct usb_device *dev); extern int usb_get_bos_descriptor(struct usb_device *dev); extern void usb_release_bos_descriptor(struct usb_device *dev); extern char *usb_cache_string(struct usb_device *udev, int index); extern int usb_set_configuration(struct usb_device *dev, int configuration); extern int usb_choose_configuration(struct usb_device *udev); extern int usb_generic_driver_probe(struct usb_device *udev); extern void usb_generic_driver_disconnect(struct usb_device *udev); extern int usb_generic_driver_suspend(struct usb_device *udev, pm_message_t msg); extern int usb_generic_driver_resume(struct usb_device *udev, pm_message_t msg); static inline unsigned usb_get_max_power(struct usb_device *udev, struct usb_host_config *c) { /* SuperSpeed power is in 8 mA units; others are in 2 mA units */ unsigned mul = (udev->speed >= USB_SPEED_SUPER ? 8 : 2); return c->desc.bMaxPower * mul; } extern void usb_kick_hub_wq(struct usb_device *dev); extern int usb_match_one_id_intf(struct usb_device *dev, struct usb_host_interface *intf, const struct usb_device_id *id); extern int usb_match_device(struct usb_device *dev, const struct usb_device_id *id); extern const struct usb_device_id *usb_device_match_id(struct usb_device *udev, const struct usb_device_id *id); extern bool usb_driver_applicable(struct usb_device *udev, struct usb_device_driver *udrv); extern void usb_forced_unbind_intf(struct usb_interface *intf); extern void usb_unbind_and_rebind_marked_interfaces(struct usb_device *udev); extern void usb_hub_release_all_ports(struct usb_device *hdev, struct usb_dev_state *owner); extern bool usb_device_is_owned(struct usb_device *udev); extern int usb_hub_init(void); extern void usb_hub_cleanup(void); extern int usb_major_init(void); extern void usb_major_cleanup(void); extern int usb_device_supports_lpm(struct usb_device *udev); extern int usb_port_disable(struct usb_device *udev); #ifdef CONFIG_PM extern int usb_suspend(struct device *dev, pm_message_t msg); extern int usb_resume(struct device *dev, pm_message_t msg); extern int usb_resume_complete(struct device *dev); extern int usb_port_suspend(struct usb_device *dev, pm_message_t msg); extern int usb_port_resume(struct usb_device *dev, pm_message_t msg); extern void usb_autosuspend_device(struct usb_device *udev); extern int usb_autoresume_device(struct usb_device *udev); extern int usb_remote_wakeup(struct usb_device *dev); extern int usb_runtime_suspend(struct device *dev); extern int usb_runtime_resume(struct device *dev); extern int usb_runtime_idle(struct device *dev); extern int usb_enable_usb2_hardware_lpm(struct usb_device *udev); extern int usb_disable_usb2_hardware_lpm(struct usb_device *udev); extern void usbfs_notify_suspend(struct usb_device *udev); extern void usbfs_notify_resume(struct usb_device *udev); #else static inline int usb_port_suspend(struct usb_device *udev, pm_message_t msg) { return 0; } static inline int usb_port_resume(struct usb_device *udev, pm_message_t msg) { return 0; } #define usb_autosuspend_device(udev) do {} while (0) static inline int usb_autoresume_device(struct usb_device *udev) { return 0; } static inline int usb_enable_usb2_hardware_lpm(struct usb_device *udev) { return 0; } static inline int usb_disable_usb2_hardware_lpm(struct usb_device *udev) { return 0; } #endif extern struct bus_type usb_bus_type; extern struct mutex usb_port_peer_mutex; extern struct device_type usb_device_type; extern struct device_type usb_if_device_type; extern struct device_type usb_ep_device_type; extern struct device_type usb_port_device_type; extern struct usb_device_driver usb_generic_driver; static inline int is_usb_device(const struct device *dev) { return dev->type == &usb_device_type; } static inline int is_usb_interface(const struct device *dev) { return dev->type == &usb_if_device_type; } static inline int is_usb_endpoint(const struct device *dev) { return dev->type == &usb_ep_device_type; } static inline int is_usb_port(const struct device *dev) { return dev->type == &usb_port_device_type; } static inline int is_root_hub(struct usb_device *udev) { return (udev->parent == NULL); } /* Do the same for device drivers and interface drivers. */ static inline int is_usb_device_driver(struct device_driver *drv) { return container_of(drv, struct usbdrv_wrap, driver)-> for_devices; } /* for labeling diagnostics */ extern const char *usbcore_name; /* sysfs stuff */ extern const struct attribute_group *usb_device_groups[]; extern const struct attribute_group *usb_interface_groups[]; /* usbfs stuff */ extern struct usb_driver usbfs_driver; extern const struct file_operations usbfs_devices_fops; extern const struct file_operations usbdev_file_operations; extern int usb_devio_init(void); extern void usb_devio_cleanup(void); /* * Firmware specific cookie identifying a port's location. '0' == no location * data available */ typedef u32 usb_port_location_t; /* internal notify stuff */ extern void usb_notify_add_device(struct usb_device *udev); extern void usb_notify_remove_device(struct usb_device *udev); extern void usb_notify_add_bus(struct usb_bus *ubus); extern void usb_notify_remove_bus(struct usb_bus *ubus); extern void usb_hub_adjust_deviceremovable(struct usb_device *hdev, struct usb_hub_descriptor *desc); #ifdef CONFIG_ACPI extern int usb_acpi_register(void); extern void usb_acpi_unregister(void); extern acpi_handle usb_get_hub_port_acpi_handle(struct usb_device *hdev, int port1); #else static inline int usb_acpi_register(void) { return 0; }; static inline void usb_acpi_unregister(void) { }; #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 /* SPDX-License-Identifier: GPL-2.0 */ /* * Prevent the compiler from merging or refetching reads or writes. The * compiler is also forbidden from reordering successive instances of * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some * particular ordering. One way to make the compiler aware of ordering is to * put the two invocations of READ_ONCE or WRITE_ONCE in different C * statements. * * These two macros will also work on aggregate data types like structs or * unions. * * Their two major use cases are: (1) Mediating communication between * process-level code and irq/NMI handlers, all running on the same CPU, * and (2) Ensuring that the compiler does not fold, spindle, or otherwise * mutilate accesses that either do not require ordering or that interact * with an explicit memory barrier or atomic instruction that provides the * required ordering. */ #ifndef __ASM_GENERIC_RWONCE_H #define __ASM_GENERIC_RWONCE_H #ifndef __ASSEMBLY__ #include <linux/compiler_types.h> #include <linux/kasan-checks.h> #include <linux/kcsan-checks.h> /* * Yes, this permits 64-bit accesses on 32-bit architectures. These will * actually be atomic in some cases (namely Armv7 + LPAE), but for others we * rely on the access being split into 2x32-bit accesses for a 32-bit quantity * (e.g. a virtual address) and a strong prevailing wind. */ #define compiletime_assert_rwonce_type(t) \ compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \ "Unsupported access size for {READ,WRITE}_ONCE().") /* * Use __READ_ONCE() instead of READ_ONCE() if you do not require any * atomicity. Note that this may result in tears! */ #ifndef __READ_ONCE #define __READ_ONCE(x) (*(const volatile __unqual_scalar_typeof(x) *)&(x)) #endif #define READ_ONCE(x) \ ({ \ compiletime_assert_rwonce_type(x); \ __READ_ONCE(x); \ }) #define __WRITE_ONCE(x, val) \ do { \ *(volatile typeof(x) *)&(x) = (val); \ } while (0) #define WRITE_ONCE(x, val) \ do { \ compiletime_assert_rwonce_type(x); \ __WRITE_ONCE(x, val); \ } while (0) static __no_sanitize_or_inline unsigned long __read_once_word_nocheck(const void *addr) { return __READ_ONCE(*(unsigned long *)addr); } /* * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need to load a * word from memory atomically but without telling KASAN/KCSAN. This is * usually used by unwinding code when walking the stack of a running process. */ #define READ_ONCE_NOCHECK(x) \ ({ \ compiletime_assert(sizeof(x) == sizeof(unsigned long), \ "Unsupported access size for READ_ONCE_NOCHECK()."); \ (typeof(x))__read_once_word_nocheck(&(x)); \ }) static __no_kasan_or_inline unsigned long read_word_at_a_time(const void *addr) { kasan_check_read(addr, 1); return *(unsigned long *)addr; } #endif /* __ASSEMBLY__ */ #endif /* __ASM_GENERIC_RWONCE_H */
3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Generic socket support routines. Memory allocators, socket lock/release * handler for protocols to use and generic option handler. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Florian La Roche, <flla@stud.uni-sb.de> * Alan Cox, <A.Cox@swansea.ac.uk> * * Fixes: * Alan Cox : Numerous verify_area() problems * Alan Cox : Connecting on a connecting socket * now returns an error for tcp. * Alan Cox : sock->protocol is set correctly. * and is not sometimes left as 0. * Alan Cox : connect handles icmp errors on a * connect properly. Unfortunately there * is a restart syscall nasty there. I * can't match BSD without hacking the C * library. Ideas urgently sought! * Alan Cox : Disallow bind() to addresses that are * not ours - especially broadcast ones!! * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, * instead they leave that for the DESTROY timer. * Alan Cox : Clean up error flag in accept * Alan Cox : TCP ack handling is buggy, the DESTROY timer * was buggy. Put a remove_sock() in the handler * for memory when we hit 0. Also altered the timer * code. The ACK stuff can wait and needs major * TCP layer surgery. * Alan Cox : Fixed TCP ack bug, removed remove sock * and fixed timer/inet_bh race. * Alan Cox : Added zapped flag for TCP * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... * Rick Sladkey : Relaxed UDP rules for matching packets. * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support * Pauline Middelink : identd support * Alan Cox : Fixed connect() taking signals I think. * Alan Cox : SO_LINGER supported * Alan Cox : Error reporting fixes * Anonymous : inet_create tidied up (sk->reuse setting) * Alan Cox : inet sockets don't set sk->type! * Alan Cox : Split socket option code * Alan Cox : Callbacks * Alan Cox : Nagle flag for Charles & Johannes stuff * Alex : Removed restriction on inet fioctl * Alan Cox : Splitting INET from NET core * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code * Alan Cox : Split IP from generic code * Alan Cox : New kfree_skbmem() * Alan Cox : Make SO_DEBUG superuser only. * Alan Cox : Allow anyone to clear SO_DEBUG * (compatibility fix) * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. * Alan Cox : Allocator for a socket is settable. * Alan Cox : SO_ERROR includes soft errors. * Alan Cox : Allow NULL arguments on some SO_ opts * Alan Cox : Generic socket allocation to make hooks * easier (suggested by Craig Metz). * Michael Pall : SO_ERROR returns positive errno again * Steve Whitehouse: Added default destructor to free * protocol private data. * Steve Whitehouse: Added various other default routines * common to several socket families. * Chris Evans : Call suser() check last on F_SETOWN * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() * Andi Kleen : Fix write_space callback * Chris Evans : Security fixes - signedness again * Arnaldo C. Melo : cleanups, use skb_queue_purge * * To Fix: */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <asm/unaligned.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/errqueue.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/poll.h> #include <linux/tcp.h> #include <linux/init.h> #include <linux/highmem.h> #include <linux/user_namespace.h> #include <linux/static_key.h> #include <linux/memcontrol.h> #include <linux/prefetch.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <linux/netdevice.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/net_namespace.h> #include <net/request_sock.h> #include <net/sock.h> #include <linux/net_tstamp.h> #include <net/xfrm.h> #include <linux/ipsec.h> #include <net/cls_cgroup.h> #include <net/netprio_cgroup.h> #include <linux/sock_diag.h> #include <linux/filter.h> #include <net/sock_reuseport.h> #include <net/bpf_sk_storage.h> #include <trace/events/sock.h> #include <net/tcp.h> #include <net/busy_poll.h> static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); static void sock_inuse_add(struct net *net, int val); /** * sk_ns_capable - General socket capability test * @sk: Socket to use a capability on or through * @user_ns: The user namespace of the capability to use * @cap: The capability to use * * Test to see if the opener of the socket had when the socket was * created and the current process has the capability @cap in the user * namespace @user_ns. */ bool sk_ns_capable(const struct sock *sk, struct user_namespace *user_ns, int cap) { return file_ns_capable(sk->sk_socket->file, user_ns, cap) && ns_capable(user_ns, cap); } EXPORT_SYMBOL(sk_ns_capable); /** * sk_capable - Socket global capability test * @sk: Socket to use a capability on or through * @cap: The global capability to use * * Test to see if the opener of the socket had when the socket was * created and the current process has the capability @cap in all user * namespaces. */ bool sk_capable(const struct sock *sk, int cap) { return sk_ns_capable(sk, &init_user_ns, cap); } EXPORT_SYMBOL(sk_capable); /** * sk_net_capable - Network namespace socket capability test * @sk: Socket to use a capability on or through * @cap: The capability to use * * Test to see if the opener of the socket had when the socket was created * and the current process has the capability @cap over the network namespace * the socket is a member of. */ bool sk_net_capable(const struct sock *sk, int cap) { return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); } EXPORT_SYMBOL(sk_net_capable); /* * Each address family might have different locking rules, so we have * one slock key per address family and separate keys for internal and * userspace sockets. */ static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_kern_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; /* * Make lock validator output more readable. (we pre-construct these * strings build-time, so that runtime initialization of socket * locks is fast): */ #define _sock_locks(x) \ x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ x "27" , x "28" , x "AF_CAN" , \ x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ x "AF_MAX" static const char *const af_family_key_strings[AF_MAX+1] = { _sock_locks("sk_lock-") }; static const char *const af_family_slock_key_strings[AF_MAX+1] = { _sock_locks("slock-") }; static const char *const af_family_clock_key_strings[AF_MAX+1] = { _sock_locks("clock-") }; static const char *const af_family_kern_key_strings[AF_MAX+1] = { _sock_locks("k-sk_lock-") }; static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { _sock_locks("k-slock-") }; static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { _sock_locks("k-clock-") }; static const char *const af_family_rlock_key_strings[AF_MAX+1] = { _sock_locks("rlock-") }; static const char *const af_family_wlock_key_strings[AF_MAX+1] = { _sock_locks("wlock-") }; static const char *const af_family_elock_key_strings[AF_MAX+1] = { _sock_locks("elock-") }; /* * sk_callback_lock and sk queues locking rules are per-address-family, * so split the lock classes by using a per-AF key: */ static struct lock_class_key af_callback_keys[AF_MAX]; static struct lock_class_key af_rlock_keys[AF_MAX]; static struct lock_class_key af_wlock_keys[AF_MAX]; static struct lock_class_key af_elock_keys[AF_MAX]; static struct lock_class_key af_kern_callback_keys[AF_MAX]; /* Run time adjustable parameters. */ __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; EXPORT_SYMBOL(sysctl_wmem_max); __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; EXPORT_SYMBOL(sysctl_rmem_max); __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; /* Maximal space eaten by iovec or ancillary data plus some space */ int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); EXPORT_SYMBOL(sysctl_optmem_max); int sysctl_tstamp_allow_data __read_mostly = 1; DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); EXPORT_SYMBOL_GPL(memalloc_socks_key); /** * sk_set_memalloc - sets %SOCK_MEMALLOC * @sk: socket to set it on * * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. * It's the responsibility of the admin to adjust min_free_kbytes * to meet the requirements */ void sk_set_memalloc(struct sock *sk) { sock_set_flag(sk, SOCK_MEMALLOC); sk->sk_allocation |= __GFP_MEMALLOC; static_branch_inc(&memalloc_socks_key); } EXPORT_SYMBOL_GPL(sk_set_memalloc); void sk_clear_memalloc(struct sock *sk) { sock_reset_flag(sk, SOCK_MEMALLOC); sk->sk_allocation &= ~__GFP_MEMALLOC; static_branch_dec(&memalloc_socks_key); /* * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward * progress of swapping. SOCK_MEMALLOC may be cleared while * it has rmem allocations due to the last swapfile being deactivated * but there is a risk that the socket is unusable due to exceeding * the rmem limits. Reclaim the reserves and obey rmem limits again. */ sk_mem_reclaim(sk); } EXPORT_SYMBOL_GPL(sk_clear_memalloc); int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { int ret; unsigned int noreclaim_flag; /* these should have been dropped before queueing */ BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); noreclaim_flag = memalloc_noreclaim_save(); ret = sk->sk_backlog_rcv(sk, skb); memalloc_noreclaim_restore(noreclaim_flag); return ret; } EXPORT_SYMBOL(__sk_backlog_rcv); static int sock_get_timeout(long timeo, void *optval, bool old_timeval) { struct __kernel_sock_timeval tv; if (timeo == MAX_SCHEDULE_TIMEOUT) { tv.tv_sec = 0; tv.tv_usec = 0; } else { tv.tv_sec = timeo / HZ; tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; } if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; *(struct old_timeval32 *)optval = tv32; return sizeof(tv32); } if (old_timeval) { struct __kernel_old_timeval old_tv; old_tv.tv_sec = tv.tv_sec; old_tv.tv_usec = tv.tv_usec; *(struct __kernel_old_timeval *)optval = old_tv; return sizeof(old_tv); } *(struct __kernel_sock_timeval *)optval = tv; return sizeof(tv); } static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, bool old_timeval) { struct __kernel_sock_timeval tv; if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct old_timeval32 tv32; if (optlen < sizeof(tv32)) return -EINVAL; if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) return -EFAULT; tv.tv_sec = tv32.tv_sec; tv.tv_usec = tv32.tv_usec; } else if (old_timeval) { struct __kernel_old_timeval old_tv; if (optlen < sizeof(old_tv)) return -EINVAL; if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) return -EFAULT; tv.tv_sec = old_tv.tv_sec; tv.tv_usec = old_tv.tv_usec; } else { if (optlen < sizeof(tv)) return -EINVAL; if (copy_from_sockptr(&tv, optval, sizeof(tv))) return -EFAULT; } if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) return -EDOM; if (tv.tv_sec < 0) { static int warned __read_mostly; *timeo_p = 0; if (warned < 10 && net_ratelimit()) { warned++; pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", __func__, current->comm, task_pid_nr(current)); } return 0; } *timeo_p = MAX_SCHEDULE_TIMEOUT; if (tv.tv_sec == 0 && tv.tv_usec == 0) return 0; if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ); return 0; } static bool sock_needs_netstamp(const struct sock *sk) { switch (sk->sk_family) { case AF_UNSPEC: case AF_UNIX: return false; default: return true; } } static void sock_disable_timestamp(struct sock *sk, unsigned long flags) { if (sk->sk_flags & flags) { sk->sk_flags &= ~flags; if (sock_needs_netstamp(sk) && !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) net_disable_timestamp(); } } int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { atomic_inc(&sk->sk_drops); trace_sock_rcvqueue_full(sk, skb); return -ENOMEM; } if (!sk_rmem_schedule(sk, skb, skb->truesize)) { atomic_inc(&sk->sk_drops); return -ENOBUFS; } skb->dev = NULL; skb_set_owner_r(skb, sk); /* we escape from rcu protected region, make sure we dont leak * a norefcounted dst */ skb_dst_force(skb); spin_lock_irqsave(&list->lock, flags); sock_skb_set_dropcount(sk, skb); __skb_queue_tail(list, skb); spin_unlock_irqrestore(&list->lock, flags); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); return 0; } EXPORT_SYMBOL(__sock_queue_rcv_skb); int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err; err = sk_filter(sk, skb); if (err) return err; return __sock_queue_rcv_skb(sk, skb); } EXPORT_SYMBOL(sock_queue_rcv_skb); int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, unsigned int trim_cap, bool refcounted) { int rc = NET_RX_SUCCESS; if (sk_filter_trim_cap(sk, skb, trim_cap)) goto discard_and_relse; skb->dev = NULL; if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { atomic_inc(&sk->sk_drops); goto discard_and_relse; } if (nested) bh_lock_sock_nested(sk); else bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { /* * trylock + unlock semantics: */ mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); rc = sk_backlog_rcv(sk, skb); mutex_release(&sk->sk_lock.dep_map, _RET_IP_); } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { bh_unlock_sock(sk); atomic_inc(&sk->sk_drops); goto discard_and_relse; } bh_unlock_sock(sk); out: if (refcounted) sock_put(sk); return rc; discard_and_relse: kfree_skb(skb); goto out; } EXPORT_SYMBOL(__sk_receive_skb); struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = __sk_dst_get(sk); if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { sk_tx_queue_clear(sk); sk->sk_dst_pending_confirm = 0; RCU_INIT_POINTER(sk->sk_dst_cache, NULL); dst_release(dst); return NULL; } return dst; } EXPORT_SYMBOL(__sk_dst_check); struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = sk_dst_get(sk); if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { sk_dst_reset(sk); dst_release(dst); return NULL; } return dst; } EXPORT_SYMBOL(sk_dst_check); static int sock_bindtoindex_locked(struct sock *sk, int ifindex) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES struct net *net = sock_net(sk); /* Sorry... */ ret = -EPERM; if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) goto out; ret = -EINVAL; if (ifindex < 0) goto out; sk->sk_bound_dev_if = ifindex; if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); sk_dst_reset(sk); ret = 0; out: #endif return ret; } int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) { int ret; if (lock_sk) lock_sock(sk); ret = sock_bindtoindex_locked(sk, ifindex); if (lock_sk) release_sock(sk); return ret; } EXPORT_SYMBOL(sock_bindtoindex); static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES struct net *net = sock_net(sk); char devname[IFNAMSIZ]; int index; ret = -EINVAL; if (optlen < 0) goto out; /* Bind this socket to a particular device like "eth0", * as specified in the passed interface name. If the * name is "" or the option length is zero the socket * is not bound. */ if (optlen > IFNAMSIZ - 1) optlen = IFNAMSIZ - 1; memset(devname, 0, sizeof(devname)); ret = -EFAULT; if (copy_from_sockptr(devname, optval, optlen)) goto out; index = 0; if (devname[0] != '\0') { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_name_rcu(net, devname); if (dev) index = dev->ifindex; rcu_read_unlock(); ret = -ENODEV; if (!dev) goto out; } return sock_bindtoindex(sk, index, true); out: #endif return ret; } static int sock_getbindtodevice(struct sock *sk, char __user *optval, int __user *optlen, int len) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES struct net *net = sock_net(sk); char devname[IFNAMSIZ]; if (sk->sk_bound_dev_if == 0) { len = 0; goto zero; } ret = -EINVAL; if (len < IFNAMSIZ) goto out; ret = netdev_get_name(net, devname, sk->sk_bound_dev_if); if (ret) goto out; len = strlen(devname) + 1; ret = -EFAULT; if (copy_to_user(optval, devname, len)) goto out; zero: ret = -EFAULT; if (put_user(len, optlen)) goto out; ret = 0; out: #endif return ret; } bool sk_mc_loop(struct sock *sk) { if (dev_recursion_level()) return false; if (!sk) return true; switch (sk->sk_family) { case AF_INET: return inet_sk(sk)->mc_loop; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return inet6_sk(sk)->mc_loop; #endif } WARN_ON_ONCE(1); return true; } EXPORT_SYMBOL(sk_mc_loop); void sock_set_reuseaddr(struct sock *sk) { lock_sock(sk); sk->sk_reuse = SK_CAN_REUSE; release_sock(sk); } EXPORT_SYMBOL(sock_set_reuseaddr); void sock_set_reuseport(struct sock *sk) { lock_sock(sk); sk->sk_reuseport = true; release_sock(sk); } EXPORT_SYMBOL(sock_set_reuseport); void sock_no_linger(struct sock *sk) { lock_sock(sk); sk->sk_lingertime = 0; sock_set_flag(sk, SOCK_LINGER); release_sock(sk); } EXPORT_SYMBOL(sock_no_linger); void sock_set_priority(struct sock *sk, u32 priority) { lock_sock(sk); sk->sk_priority = priority; release_sock(sk); } EXPORT_SYMBOL(sock_set_priority); void sock_set_sndtimeo(struct sock *sk, s64 secs) { lock_sock(sk); if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) sk->sk_sndtimeo = secs * HZ; else sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; release_sock(sk); } EXPORT_SYMBOL(sock_set_sndtimeo); static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) { if (val) { sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); sock_set_flag(sk, SOCK_RCVTSTAMP); sock_enable_timestamp(sk, SOCK_TIMESTAMP); } else { sock_reset_flag(sk, SOCK_RCVTSTAMP); sock_reset_flag(sk, SOCK_RCVTSTAMPNS); } } void sock_enable_timestamps(struct sock *sk) { lock_sock(sk); __sock_set_timestamps(sk, true, false, true); release_sock(sk); } EXPORT_SYMBOL(sock_enable_timestamps); void sock_set_keepalive(struct sock *sk) { lock_sock(sk); if (sk->sk_prot->keepalive) sk->sk_prot->keepalive(sk, true); sock_valbool_flag(sk, SOCK_KEEPOPEN, true); release_sock(sk); } EXPORT_SYMBOL(sock_set_keepalive); static void __sock_set_rcvbuf(struct sock *sk, int val) { /* Ensure val * 2 fits into an int, to prevent max_t() from treating it * as a negative value. */ val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_RCVBUF_LOCK; /* We double it on the way in to account for "struct sk_buff" etc. * overhead. Applications assume that the SO_RCVBUF setting they make * will allow that much actual data to be received on that socket. * * Applications are unaware that "struct sk_buff" and other overheads * allocate from the receive buffer during socket buffer allocation. * * And after considering the possible alternatives, returning the value * we actually used in getsockopt is the most desirable behavior. */ WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); } void sock_set_rcvbuf(struct sock *sk, int val) { lock_sock(sk); __sock_set_rcvbuf(sk, val); release_sock(sk); } EXPORT_SYMBOL(sock_set_rcvbuf); static void __sock_set_mark(struct sock *sk, u32 val) { if (val != sk->sk_mark) { sk->sk_mark = val; sk_dst_reset(sk); } } void sock_set_mark(struct sock *sk, u32 val) { lock_sock(sk); __sock_set_mark(sk, val); release_sock(sk); } EXPORT_SYMBOL(sock_set_mark); /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. */ int sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock_txtime sk_txtime; struct sock *sk = sock->sk; int val; int valbool; struct linger ling; int ret = 0; /* * Options without arguments */ if (optname == SO_BINDTODEVICE) return sock_setbindtodevice(sk, optval, optlen); if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; valbool = val ? 1 : 0; lock_sock(sk); switch (optname) { case SO_DEBUG: if (val && !capable(CAP_NET_ADMIN)) ret = -EACCES; else sock_valbool_flag(sk, SOCK_DBG, valbool); break; case SO_REUSEADDR: sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); break; case SO_REUSEPORT: sk->sk_reuseport = valbool; break; case SO_TYPE: case SO_PROTOCOL: case SO_DOMAIN: case SO_ERROR: ret = -ENOPROTOOPT; break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); sk_dst_reset(sk); break; case SO_BROADCAST: sock_valbool_flag(sk, SOCK_BROADCAST, valbool); break; case SO_SNDBUF: /* Don't error on this BSD doesn't and if you think * about it this is right. Otherwise apps have to * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ val = min_t(u32, val, sysctl_wmem_max); set_sndbuf: /* Ensure val * 2 fits into an int, to prevent max_t() * from treating it as a negative value. */ val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; WRITE_ONCE(sk->sk_sndbuf, max_t(int, val * 2, SOCK_MIN_SNDBUF)); /* Wake up sending tasks if we upped the value. */ sk->sk_write_space(sk); break; case SO_SNDBUFFORCE: if (!capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } /* No negative values (to prevent underflow, as val will be * multiplied by 2). */ if (val < 0) val = 0; goto set_sndbuf; case SO_RCVBUF: /* Don't error on this BSD doesn't and if you think * about it this is right. Otherwise apps have to * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max)); break; case SO_RCVBUFFORCE: if (!capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } /* No negative values (to prevent underflow, as val will be * multiplied by 2). */ __sock_set_rcvbuf(sk, max(val, 0)); break; case SO_KEEPALIVE: if (sk->sk_prot->keepalive) sk->sk_prot->keepalive(sk, valbool); sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); break; case SO_OOBINLINE: sock_valbool_flag(sk, SOCK_URGINLINE, valbool); break; case SO_NO_CHECK: sk->sk_no_check_tx = valbool; break; case SO_PRIORITY: if ((val >= 0 && val <= 6) || ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) sk->sk_priority = val; else ret = -EPERM; break; case SO_LINGER: if (optlen < sizeof(ling)) { ret = -EINVAL; /* 1003.1g */ break; } if (copy_from_sockptr(&ling, optval, sizeof(ling))) { ret = -EFAULT; break; } if (!ling.l_onoff) sock_reset_flag(sk, SOCK_LINGER); else { #if (BITS_PER_LONG == 32) if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; else #endif sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; sock_set_flag(sk, SOCK_LINGER); } break; case SO_BSDCOMPAT: break; case SO_PASSCRED: if (valbool) set_bit(SOCK_PASSCRED, &sock->flags); else clear_bit(SOCK_PASSCRED, &sock->flags); break; case SO_TIMESTAMP_OLD: __sock_set_timestamps(sk, valbool, false, false); break; case SO_TIMESTAMP_NEW: __sock_set_timestamps(sk, valbool, true, false); break; case SO_TIMESTAMPNS_OLD: __sock_set_timestamps(sk, valbool, false, true); break; case SO_TIMESTAMPNS_NEW: __sock_set_timestamps(sk, valbool, true, true); break; case SO_TIMESTAMPING_NEW: case SO_TIMESTAMPING_OLD: if (val & ~SOF_TIMESTAMPING_MASK) { ret = -EINVAL; break; } if (val & SOF_TIMESTAMPING_OPT_ID && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { if (sk->sk_protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM) { if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) { ret = -EINVAL; break; } sk->sk_tskey = tcp_sk(sk)->snd_una; } else { sk->sk_tskey = 0; } } if (val & SOF_TIMESTAMPING_OPT_STATS && !(val & SOF_TIMESTAMPING_OPT_TSONLY)) { ret = -EINVAL; break; } sk->sk_tsflags = val; sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); else sock_disable_timestamp(sk, (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); break; case SO_RCVLOWAT: if (val < 0) val = INT_MAX; if (sock->ops->set_rcvlowat) ret = sock->ops->set_rcvlowat(sk, val); else WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); break; case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD); break; case SO_SNDTIMEO_OLD: case SO_SNDTIMEO_NEW: ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD); break; case SO_ATTACH_FILTER: { struct sock_fprog fprog; ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); if (!ret) ret = sk_attach_filter(&fprog, sk); break; } case SO_ATTACH_BPF: ret = -EINVAL; if (optlen == sizeof(u32)) { u32 ufd; ret = -EFAULT; if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; ret = sk_attach_bpf(ufd, sk); } break; case SO_ATTACH_REUSEPORT_CBPF: { struct sock_fprog fprog; ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); if (!ret) ret = sk_reuseport_attach_filter(&fprog, sk); break; } case SO_ATTACH_REUSEPORT_EBPF: ret = -EINVAL; if (optlen == sizeof(u32)) { u32 ufd; ret = -EFAULT; if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; ret = sk_reuseport_attach_bpf(ufd, sk); } break; case SO_DETACH_REUSEPORT_BPF: ret = reuseport_detach_prog(sk); break; case SO_DETACH_FILTER: ret = sk_detach_filter(sk); break; case SO_LOCK_FILTER: if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) ret = -EPERM; else sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); break; case SO_PASSSEC: if (valbool) set_bit(SOCK_PASSSEC, &sock->flags); else clear_bit(SOCK_PASSSEC, &sock->flags); break; case SO_MARK: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } __sock_set_mark(sk, val); break; case SO_RXQ_OVFL: sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); break; case SO_WIFI_STATUS: sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); break; case SO_PEEK_OFF: if (sock->ops->set_peek_off) ret = sock->ops->set_peek_off(sk, val); else ret = -EOPNOTSUPP; break; case SO_NOFCS: sock_valbool_flag(sk, SOCK_NOFCS, valbool); break; case SO_SELECT_ERR_QUEUE: sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); break; #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: /* allow unprivileged users to decrease the value */ if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) ret = -EPERM; else { if (val < 0) ret = -EINVAL; else WRITE_ONCE(sk->sk_ll_usec, val); } break; #endif case SO_MAX_PACING_RATE: { unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; if (sizeof(ulval) != sizeof(val) && optlen >= sizeof(ulval) && copy_from_sockptr(&ulval, optval, sizeof(ulval))) { ret = -EFAULT; break; } if (ulval != ~0UL) cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); sk->sk_max_pacing_rate = ulval; sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval); break; } case SO_INCOMING_CPU: WRITE_ONCE(sk->sk_incoming_cpu, val); break; case SO_CNX_ADVICE: if (val == 1) dst_negative_advice(sk); break; case SO_ZEROCOPY: if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { if (!((sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP) || (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP))) ret = -ENOTSUPP; } else if (sk->sk_family != PF_RDS) { ret = -ENOTSUPP; } if (!ret) { if (val < 0 || val > 1) ret = -EINVAL; else sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); } break; case SO_TXTIME: if (optlen != sizeof(struct sock_txtime)) { ret = -EINVAL; break; } else if (copy_from_sockptr(&sk_txtime, optval, sizeof(struct sock_txtime))) { ret = -EFAULT; break; } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { ret = -EINVAL; break; } /* CLOCK_MONOTONIC is only used by sch_fq, and this packet * scheduler has enough safe guards. */ if (sk_txtime.clockid != CLOCK_MONOTONIC && !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } sock_valbool_flag(sk, SOCK_TXTIME, true); sk->sk_clockid = sk_txtime.clockid; sk->sk_txtime_deadline_mode = !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); sk->sk_txtime_report_errors = !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); break; case SO_BINDTOIFINDEX: ret = sock_bindtoindex_locked(sk, val); break; default: ret = -ENOPROTOOPT; break; } release_sock(sk); return ret; } EXPORT_SYMBOL(sock_setsockopt); static const struct cred *sk_get_peer_cred(struct sock *sk) { const struct cred *cred; spin_lock(&sk->sk_peer_lock); cred = get_cred(sk->sk_peer_cred); spin_unlock(&sk->sk_peer_lock); return cred; } static void cred_to_ucred(struct pid *pid, const struct cred *cred, struct ucred *ucred) { ucred->pid = pid_vnr(pid); ucred->uid = ucred->gid = -1; if (cred) { struct user_namespace *current_ns = current_user_ns(); ucred->uid = from_kuid_munged(current_ns, cred->euid); ucred->gid = from_kgid_munged(current_ns, cred->egid); } } static int groups_to_user(gid_t __user *dst, const struct group_info *src) { struct user_namespace *user_ns = current_user_ns(); int i; for (i = 0; i < src->ngroups; i++) if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i)) return -EFAULT; return 0; } int sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; union { int val; u64 val64; unsigned long ulval; struct linger ling; struct old_timeval32 tm32; struct __kernel_old_timeval tm; struct __kernel_sock_timeval stm; struct sock_txtime txtime; } v; int lv = sizeof(int); int len; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; memset(&v, 0, sizeof(v)); switch (optname) { case SO_DEBUG: v.val = sock_flag(sk, SOCK_DBG); break; case SO_DONTROUTE: v.val = sock_flag(sk, SOCK_LOCALROUTE); break; case SO_BROADCAST: v.val = sock_flag(sk, SOCK_BROADCAST); break; case SO_SNDBUF: v.val = sk->sk_sndbuf; break; case SO_RCVBUF: v.val = sk->sk_rcvbuf; break; case SO_REUSEADDR: v.val = sk->sk_reuse; break; case SO_REUSEPORT: v.val = sk->sk_reuseport; break; case SO_KEEPALIVE: v.val = sock_flag(sk, SOCK_KEEPOPEN); break; case SO_TYPE: v.val = sk->sk_type; break; case SO_PROTOCOL: v.val = sk->sk_protocol; break; case SO_DOMAIN: v.val = sk->sk_family; break; case SO_ERROR: v.val = -sock_error(sk); if (v.val == 0) v.val = xchg(&sk->sk_err_soft, 0); break; case SO_OOBINLINE: v.val = sock_flag(sk, SOCK_URGINLINE); break; case SO_NO_CHECK: v.val = sk->sk_no_check_tx; break; case SO_PRIORITY: v.val = sk->sk_priority; break; case SO_LINGER: lv = sizeof(v.ling); v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); v.ling.l_linger = sk->sk_lingertime / HZ; break; case SO_BSDCOMPAT: break; case SO_TIMESTAMP_OLD: v.val = sock_flag(sk, SOCK_RCVTSTAMP) && !sock_flag(sk, SOCK_TSTAMP_NEW) && !sock_flag(sk, SOCK_RCVTSTAMPNS); break; case SO_TIMESTAMPNS_OLD: v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); break; case SO_TIMESTAMP_NEW: v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); break; case SO_TIMESTAMPNS_NEW: v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); break; case SO_TIMESTAMPING_OLD: v.val = sk->sk_tsflags; break; case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname); break; case SO_SNDTIMEO_OLD: case SO_SNDTIMEO_NEW: lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname); break; case SO_RCVLOWAT: v.val = sk->sk_rcvlowat; break; case SO_SNDLOWAT: v.val = 1; break; case SO_PASSCRED: v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); break; case SO_PEERCRED: { struct ucred peercred; if (len > sizeof(peercred)) len = sizeof(peercred); spin_lock(&sk->sk_peer_lock); cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); spin_unlock(&sk->sk_peer_lock); if (copy_to_user(optval, &peercred, len)) return -EFAULT; goto lenout; } case SO_PEERGROUPS: { const struct cred *cred; int ret, n; cred = sk_get_peer_cred(sk); if (!cred) return -ENODATA; n = cred->group_info->ngroups; if (len < n * sizeof(gid_t)) { len = n * sizeof(gid_t); put_cred(cred); return put_user(len, optlen) ? -EFAULT : -ERANGE; } len = n * sizeof(gid_t); ret = groups_to_user((gid_t __user *)optval, cred->group_info); put_cred(cred); if (ret) return ret; goto lenout; } case SO_PEERNAME: { char address[128]; lv = sock->ops->getname(sock, (struct sockaddr *)address, 2); if (lv < 0) return -ENOTCONN; if (lv < len) return -EINVAL; if (copy_to_user(optval, address, len)) return -EFAULT; goto lenout; } /* Dubious BSD thing... Probably nobody even uses it, but * the UNIX standard wants it for whatever reason... -DaveM */ case SO_ACCEPTCONN: v.val = sk->sk_state == TCP_LISTEN; break; case SO_PASSSEC: v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); break; case SO_PEERSEC: return security_socket_getpeersec_stream(sock, optval, optlen, len); case SO_MARK: v.val = sk->sk_mark; break; case SO_RXQ_OVFL: v.val = sock_flag(sk, SOCK_RXQ_OVFL); break; case SO_WIFI_STATUS: v.val = sock_flag(sk, SOCK_WIFI_STATUS); break; case SO_PEEK_OFF: if (!sock->ops->set_peek_off) return -EOPNOTSUPP; v.val = sk->sk_peek_off; break; case SO_NOFCS: v.val = sock_flag(sk, SOCK_NOFCS); break; case SO_BINDTODEVICE: return sock_getbindtodevice(sk, optval, optlen, len); case SO_GET_FILTER: len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); if (len < 0) return len; goto lenout; case SO_LOCK_FILTER: v.val = sock_flag(sk, SOCK_FILTER_LOCKED); break; case SO_BPF_EXTENSIONS: v.val = bpf_tell_extensions(); break; case SO_SELECT_ERR_QUEUE: v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); break; #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: v.val = sk->sk_ll_usec; break; #endif case SO_MAX_PACING_RATE: if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { lv = sizeof(v.ulval); v.ulval = sk->sk_max_pacing_rate; } else { /* 32bit version */ v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U); } break; case SO_INCOMING_CPU: v.val = READ_ONCE(sk->sk_incoming_cpu); break; case SO_MEMINFO: { u32 meminfo[SK_MEMINFO_VARS]; sk_get_meminfo(sk, meminfo); len = min_t(unsigned int, len, sizeof(meminfo)); if (copy_to_user(optval, &meminfo, len)) return -EFAULT; goto lenout; } #ifdef CONFIG_NET_RX_BUSY_POLL case SO_INCOMING_NAPI_ID: v.val = READ_ONCE(sk->sk_napi_id); /* aggregate non-NAPI IDs down to 0 */ if (v.val < MIN_NAPI_ID) v.val = 0; break; #endif case SO_COOKIE: lv = sizeof(u64); if (len < lv) return -EINVAL; v.val64 = sock_gen_cookie(sk); break; case SO_ZEROCOPY: v.val = sock_flag(sk, SOCK_ZEROCOPY); break; case SO_TXTIME: lv = sizeof(v.txtime); v.txtime.clockid = sk->sk_clockid; v.txtime.flags |= sk->sk_txtime_deadline_mode ? SOF_TXTIME_DEADLINE_MODE : 0; v.txtime.flags |= sk->sk_txtime_report_errors ? SOF_TXTIME_REPORT_ERRORS : 0; break; case SO_BINDTOIFINDEX: v.val = sk->sk_bound_dev_if; break; default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). */ return -ENOPROTOOPT; } if (len > lv) len = lv; if (copy_to_user(optval, &v, len)) return -EFAULT; lenout: if (put_user(len, optlen)) return -EFAULT; return 0; } /* * Initialize an sk_lock. * * (We also register the sk_lock with the lock validator.) */ static inline void sock_lock_init(struct sock *sk) { if (sk->sk_kern_sock) sock_lock_init_class_and_name( sk, af_family_kern_slock_key_strings[sk->sk_family], af_family_kern_slock_keys + sk->sk_family, af_family_kern_key_strings[sk->sk_family], af_family_kern_keys + sk->sk_family); else sock_lock_init_class_and_name( sk, af_family_slock_key_strings[sk->sk_family], af_family_slock_keys + sk->sk_family, af_family_key_strings[sk->sk_family], af_family_keys + sk->sk_family); } /* * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, * even temporarly, because of RCU lookups. sk_node should also be left as is. * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end */ static void sock_copy(struct sock *nsk, const struct sock *osk) { const struct proto *prot = READ_ONCE(osk->sk_prot); #ifdef CONFIG_SECURITY_NETWORK void *sptr = nsk->sk_security; #endif memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); #ifdef CONFIG_SECURITY_NETWORK nsk->sk_security = sptr; security_sk_clone(osk, nsk); #endif } static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, int family) { struct sock *sk; struct kmem_cache *slab; slab = prot->slab; if (slab != NULL) { sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); if (!sk) return sk; if (want_init_on_alloc(priority)) sk_prot_clear_nulls(sk, prot->obj_size); } else sk = kmalloc(prot->obj_size, priority); if (sk != NULL) { if (security_sk_alloc(sk, family, priority)) goto out_free; if (!try_module_get(prot->owner)) goto out_free_sec; sk_tx_queue_clear(sk); } return sk; out_free_sec: security_sk_free(sk); out_free: if (slab != NULL) kmem_cache_free(slab, sk); else kfree(sk); return NULL; } static void sk_prot_free(struct proto *prot, struct sock *sk) { struct kmem_cache *slab; struct module *owner; owner = prot->owner; slab = prot->slab; cgroup_sk_free(&sk->sk_cgrp_data); mem_cgroup_sk_free(sk); security_sk_free(sk); if (slab != NULL) kmem_cache_free(slab, sk); else kfree(sk); module_put(owner); } /** * sk_alloc - All socket objects are allocated here * @net: the applicable net namespace * @family: protocol family * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * @prot: struct proto associated with this new sock instance * @kern: is this to be a kernel socket? */ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern) { struct sock *sk; sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); if (sk) { sk->sk_family = family; /* * See comment in struct sock definition to understand * why we need sk_prot_creator -acme */ sk->sk_prot = sk->sk_prot_creator = prot; sk->sk_kern_sock = kern; sock_lock_init(sk); sk->sk_net_refcnt = kern ? 0 : 1; if (likely(sk->sk_net_refcnt)) { get_net(net); sock_inuse_add(net, 1); } sock_net_set(sk, net); refcount_set(&sk->sk_wmem_alloc, 1); mem_cgroup_sk_alloc(sk); cgroup_sk_alloc(&sk->sk_cgrp_data); sock_update_classid(&sk->sk_cgrp_data); sock_update_netprioidx(&sk->sk_cgrp_data); sk_tx_queue_clear(sk); } return sk; } EXPORT_SYMBOL(sk_alloc); /* Sockets having SOCK_RCU_FREE will call this function after one RCU * grace period. This is the case for UDP sockets and TCP listeners. */ static void __sk_destruct(struct rcu_head *head) { struct sock *sk = container_of(head, struct sock, sk_rcu); struct sk_filter *filter; if (sk->sk_destruct) sk->sk_destruct(sk); filter = rcu_dereference_check(sk->sk_filter, refcount_read(&sk->sk_wmem_alloc) == 0); if (filter) { sk_filter_uncharge(sk, filter); RCU_INIT_POINTER(sk->sk_filter, NULL); } sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); #ifdef CONFIG_BPF_SYSCALL bpf_sk_storage_free(sk); #endif if (atomic_read(&sk->sk_omem_alloc)) pr_debug("%s: optmem leakage (%d bytes) detected\n", __func__, atomic_read(&sk->sk_omem_alloc)); if (sk->sk_frag.page) { put_page(sk->sk_frag.page); sk->sk_frag.page = NULL; } /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); if (likely(sk->sk_net_refcnt)) put_net(sock_net(sk)); sk_prot_free(sk->sk_prot_creator, sk); } void sk_destruct(struct sock *sk) { bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); if (rcu_access_pointer(sk->sk_reuseport_cb)) { reuseport_detach_sock(sk); use_call_rcu = true; } if (use_call_rcu) call_rcu(&sk->sk_rcu, __sk_destruct); else __sk_destruct(&sk->sk_rcu); } static void __sk_free(struct sock *sk) { if (likely(sk->sk_net_refcnt)) sock_inuse_add(sock_net(sk), -1); if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) sock_diag_broadcast_destroy(sk); else sk_destruct(sk); } void sk_free(struct sock *sk) { /* * We subtract one from sk_wmem_alloc and can know if * some packets are still in some tx queue. * If not null, sock_wfree() will call __sk_free(sk) later */ if (refcount_dec_and_test(&sk->sk_wmem_alloc)) __sk_free(sk); } EXPORT_SYMBOL(sk_free); static void sk_init_common(struct sock *sk) { skb_queue_head_init(&sk->sk_receive_queue); skb_queue_head_init(&sk->sk_write_queue); skb_queue_head_init(&sk->sk_error_queue); rwlock_init(&sk->sk_callback_lock); lockdep_set_class_and_name(&sk->sk_receive_queue.lock, af_rlock_keys + sk->sk_family, af_family_rlock_key_strings[sk->sk_family]); lockdep_set_class_and_name(&sk->sk_write_queue.lock, af_wlock_keys + sk->sk_family, af_family_wlock_key_strings[sk->sk_family]); lockdep_set_class_and_name(&sk->sk_error_queue.lock, af_elock_keys + sk->sk_family, af_family_elock_key_strings[sk->sk_family]); lockdep_set_class_and_name(&sk->sk_callback_lock, af_callback_keys + sk->sk_family, af_family_clock_key_strings[sk->sk_family]); } /** * sk_clone_lock - clone a socket, and lock its clone * @sk: the socket to clone * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) */ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) { struct proto *prot = READ_ONCE(sk->sk_prot); struct sock *newsk; bool is_charged = true; newsk = sk_prot_alloc(prot, priority, sk->sk_family); if (newsk != NULL) { struct sk_filter *filter; sock_copy(newsk, sk); newsk->sk_prot_creator = prot; /* SANITY */ if (likely(newsk->sk_net_refcnt)) get_net(sock_net(newsk)); sk_node_init(&newsk->sk_node); sock_lock_init(newsk); bh_lock_sock(newsk); newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; newsk->sk_backlog.len = 0; atomic_set(&newsk->sk_rmem_alloc, 0); /* * sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ refcount_set(&newsk->sk_wmem_alloc, 1); atomic_set(&newsk->sk_omem_alloc, 0); sk_init_common(newsk); newsk->sk_dst_cache = NULL; newsk->sk_dst_pending_confirm = 0; newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; atomic_set(&newsk->sk_drops, 0); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; atomic_set(&newsk->sk_zckey, 0); sock_reset_flag(newsk, SOCK_DONE); /* sk->sk_memcg will be populated at accept() time */ newsk->sk_memcg = NULL; cgroup_sk_clone(&newsk->sk_cgrp_data); rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter != NULL) /* though it's an empty new sock, the charging may fail * if sysctl_optmem_max was changed between creation of * original socket and cloning */ is_charged = sk_filter_charge(newsk, filter); RCU_INIT_POINTER(newsk->sk_filter, filter); rcu_read_unlock(); if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { /* We need to make sure that we don't uncharge the new * socket if we couldn't charge it in the first place * as otherwise we uncharge the parent's filter. */ if (!is_charged) RCU_INIT_POINTER(newsk->sk_filter, NULL); sk_free_unlock_clone(newsk); newsk = NULL; goto out; } RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); if (bpf_sk_storage_clone(sk, newsk)) { sk_free_unlock_clone(newsk); newsk = NULL; goto out; } /* Clear sk_user_data if parent had the pointer tagged * as not suitable for copying when cloning. */ if (sk_user_data_is_nocopy(newsk)) newsk->sk_user_data = NULL; newsk->sk_err = 0; newsk->sk_err_soft = 0; newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); if (likely(newsk->sk_net_refcnt)) sock_inuse_add(sock_net(newsk), 1); /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.rst for details) */ smp_wmb(); refcount_set(&newsk->sk_refcnt, 2); /* * Increment the counter in the same struct proto as the master * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that * is the same as sk->sk_prot->socks, as this field was copied * with memcpy). * * This _changes_ the previous behaviour, where * tcp_create_openreq_child always was incrementing the * equivalent to tcp_prot->socks (inet_sock_nr), so this have * to be taken into account in all callers. -acme */ sk_refcnt_debug_inc(newsk); sk_set_socket(newsk, NULL); sk_tx_queue_clear(newsk); RCU_INIT_POINTER(newsk->sk_wq, NULL); if (newsk->sk_prot->sockets_allocated) sk_sockets_allocated_inc(newsk); if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) net_enable_timestamp(); } out: return newsk; } EXPORT_SYMBOL_GPL(sk_clone_lock); void sk_free_unlock_clone(struct sock *sk) { /* It is still raw copy of parent, so invalidate * destructor and make plain sk_free() */ sk->sk_destruct = NULL; bh_unlock_sock(sk); sk_free(sk); } EXPORT_SYMBOL_GPL(sk_free_unlock_clone); void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; sk_dst_set(sk, dst); sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps; if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; sk->sk_route_caps &= ~sk->sk_route_nocaps; if (sk_can_gso(sk)) { if (dst->header_len && !xfrm_dst_offload_ok(dst)) { sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; sk->sk_gso_max_size = dst->dev->gso_max_size; max_segs = max_t(u32, dst->dev->gso_max_segs, 1); } } sk->sk_gso_max_segs = max_segs; } EXPORT_SYMBOL_GPL(sk_setup_caps); /* * Simple resource managers for sockets. */ /* * Write buffer destructor automatically called from kfree_skb. */ void sock_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; unsigned int len = skb->truesize; if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { /* * Keep a reference on sk_wmem_alloc, this will be released * after sk_write_space() call */ WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); sk->sk_write_space(sk); len = 1; } /* * if sk_wmem_alloc reaches 0, we must finish what sk_free() * could not do because of in-flight packets */ if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) __sk_free(sk); } EXPORT_SYMBOL(sock_wfree); /* This variant of sock_wfree() is used by TCP, * since it sets SOCK_USE_WRITE_QUEUE. */ void __sock_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) __sk_free(sk); } void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); skb->sk = sk; #ifdef CONFIG_INET if (unlikely(!sk_fullsock(sk))) { skb->destructor = sock_edemux; sock_hold(sk); return; } #endif skb->destructor = sock_wfree; skb_set_hash_from_sk(skb, sk); /* * We used to take a refcount on sk, but following operation * is enough to guarantee sk_free() wont free this sock until * all in-flight packets are completed */ refcount_add(skb->truesize, &sk->sk_wmem_alloc); } EXPORT_SYMBOL(skb_set_owner_w); static bool can_skb_orphan_partial(const struct sk_buff *skb) { #ifdef CONFIG_TLS_DEVICE /* Drivers depend on in-order delivery for crypto offload, * partial orphan breaks out-of-order-OK logic. */ if (skb->decrypted) return false; #endif return (skb->destructor == sock_wfree || (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); } /* This helper is used by netem, as it can hold packets in its * delay queue. We want to allow the owner socket to send more * packets, as if they were already TX completed by a typical driver. * But we also want to keep skb->sk set because some packet schedulers * rely on it (sch_fq for example). */ void skb_orphan_partial(struct sk_buff *skb) { if (skb_is_tcp_pure_ack(skb)) return; if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) return; skb_orphan(skb); } EXPORT_SYMBOL(skb_orphan_partial); /* * Read buffer destructor automatically called from kfree_skb. */ void sock_rfree(struct sk_buff *skb) { struct sock *sk = skb->sk; unsigned int len = skb->truesize; atomic_sub(len, &sk->sk_rmem_alloc); sk_mem_uncharge(sk, len); } EXPORT_SYMBOL(sock_rfree); /* * Buffer destructor for skbs that are not used directly in read or write * path, e.g. for error handler skbs. Automatically called from kfree_skb. */ void sock_efree(struct sk_buff *skb) { sock_put(skb->sk); } EXPORT_SYMBOL(sock_efree); /* Buffer destructor for prefetch/receive path where reference count may * not be held, e.g. for listen sockets. */ #ifdef CONFIG_INET void sock_pfree(struct sk_buff *skb) { if (sk_is_refcounted(skb->sk)) sock_gen_put(skb->sk); } EXPORT_SYMBOL(sock_pfree); #endif /* CONFIG_INET */ kuid_t sock_i_uid(struct sock *sk) { kuid_t uid; read_lock_bh(&sk->sk_callback_lock); uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; read_unlock_bh(&sk->sk_callback_lock); return uid; } EXPORT_SYMBOL(sock_i_uid); unsigned long sock_i_ino(struct sock *sk) { unsigned long ino; read_lock_bh(&sk->sk_callback_lock); ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; read_unlock_bh(&sk->sk_callback_lock); return ino; } EXPORT_SYMBOL(sock_i_ino); /* * Allocate a skb from the socket's send buffer. */ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority) { if (force || refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { struct sk_buff *skb = alloc_skb(size, priority); if (skb) { skb_set_owner_w(skb, sk); return skb; } } return NULL; } EXPORT_SYMBOL(sock_wmalloc); static void sock_ofree(struct sk_buff *skb) { struct sock *sk = skb->sk; atomic_sub(skb->truesize, &sk->sk_omem_alloc); } struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, gfp_t priority) { struct sk_buff *skb; /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > sysctl_optmem_max) return NULL; skb = alloc_skb(size, priority); if (!skb) return NULL; atomic_add(skb->truesize, &sk->sk_omem_alloc); skb->sk = sk; skb->destructor = sock_ofree; return skb; } /* * Allocate a memory block from the socket's option memory buffer. */ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) { if ((unsigned int)size <= sysctl_optmem_max && atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { void *mem; /* First do the add, to avoid the race if kmalloc * might sleep. */ atomic_add(size, &sk->sk_omem_alloc); mem = kmalloc(size, priority); if (mem) return mem; atomic_sub(size, &sk->sk_omem_alloc); } return NULL; } EXPORT_SYMBOL(sock_kmalloc); /* Free an option memory block. Note, we actually want the inline * here as this allows gcc to detect the nullify and fold away the * condition entirely. */ static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, const bool nullify) { if (WARN_ON_ONCE(!mem)) return; if (nullify) kfree_sensitive(mem); else kfree(mem); atomic_sub(size, &sk->sk_omem_alloc); } void sock_kfree_s(struct sock *sk, void *mem, int size) { __sock_kfree_s(sk, mem, size, false); } EXPORT_SYMBOL(sock_kfree_s); void sock_kzfree_s(struct sock *sk, void *mem, int size) { __sock_kfree_s(sk, mem, size, true); } EXPORT_SYMBOL(sock_kzfree_s); /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. I think, these locks should be removed for datagram sockets. */ static long sock_wait_for_wmem(struct sock *sk, long timeo) { DEFINE_WAIT(wait); sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); for (;;) { if (!timeo) break; if (signal_pending(current)) break; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) break; if (sk->sk_shutdown & SEND_SHUTDOWN) break; if (sk->sk_err) break; timeo = schedule_timeout(timeo); } finish_wait(sk_sleep(sk), &wait); return timeo; } /* * Generic send/receive buffer handlers */ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, int *errcode, int max_page_order) { struct sk_buff *skb; long timeo; int err; timeo = sock_sndtimeo(sk, noblock); for (;;) { err = sock_error(sk); if (err != 0) goto failure; err = -EPIPE; if (sk->sk_shutdown & SEND_SHUTDOWN) goto failure; if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) break; sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; if (!timeo) goto failure; if (signal_pending(current)) goto interrupted; timeo = sock_wait_for_wmem(sk, timeo); } skb = alloc_skb_with_frags(header_len, data_len, max_page_order, errcode, sk->sk_allocation); if (skb) skb_set_owner_w(skb, sk); return skb; interrupted: err = sock_intr_errno(timeo); failure: *errcode = err; return NULL; } EXPORT_SYMBOL(sock_alloc_send_pskb); struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode) { return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); } EXPORT_SYMBOL(sock_alloc_send_skb); int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, struct sockcm_cookie *sockc) { u32 tsflags; switch (cmsg->cmsg_type) { case SO_MARK: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; sockc->mark = *(u32 *)CMSG_DATA(cmsg); break; case SO_TIMESTAMPING_OLD: if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; tsflags = *(u32 *)CMSG_DATA(cmsg); if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) return -EINVAL; sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; sockc->tsflags |= tsflags; break; case SCM_TXTIME: if (!sock_flag(sk, SOCK_TXTIME)) return -EINVAL; if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) return -EINVAL; sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); break; /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ case SCM_RIGHTS: case SCM_CREDENTIALS: break; default: return -EINVAL; } return 0; } EXPORT_SYMBOL(__sock_cmsg_send); int sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct sockcm_cookie *sockc) { struct cmsghdr *cmsg; int ret; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_SOCKET) continue; ret = __sock_cmsg_send(sk, msg, cmsg, sockc); if (ret) return ret; } return 0; } EXPORT_SYMBOL(sock_cmsg_send); static void sk_enter_memory_pressure(struct sock *sk) { if (!sk->sk_prot->enter_memory_pressure) return; sk->sk_prot->enter_memory_pressure(sk); } static void sk_leave_memory_pressure(struct sock *sk) { if (sk->sk_prot->leave_memory_pressure) { sk->sk_prot->leave_memory_pressure(sk); } else { unsigned long *memory_pressure = sk->sk_prot->memory_pressure; if (memory_pressure && READ_ONCE(*memory_pressure)) WRITE_ONCE(*memory_pressure, 0); } } #define SKB_FRAG_PAGE_ORDER get_order(32768) DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); /** * skb_page_frag_refill - check that a page_frag contains enough room * @sz: minimum size of the fragment we want to get * @pfrag: pointer to page_frag * @gfp: priority for memory allocation * * Note: While this allocator tries to use high order pages, there is * no guarantee that allocations succeed. Therefore, @sz MUST be * less or equal than PAGE_SIZE. */ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) { if (pfrag->page) { if (page_ref_count(pfrag->page) == 1) { pfrag->offset = 0; return true; } if (pfrag->offset + sz <= pfrag->size) return true; put_page(pfrag->page); } pfrag->offset = 0; if (SKB_FRAG_PAGE_ORDER && !static_branch_unlikely(&net_high_order_alloc_disable_key)) { /* Avoid direct reclaim but allow kswapd to wake */ pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY, SKB_FRAG_PAGE_ORDER); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; return true; } } pfrag->page = alloc_page(gfp); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE; return true; } return false; } EXPORT_SYMBOL(skb_page_frag_refill); bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) { if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) return true; sk_enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); return false; } EXPORT_SYMBOL(sk_page_frag_refill); static void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { DEFINE_WAIT(wait); for (;;) { prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, TASK_UNINTERRUPTIBLE); spin_unlock_bh(&sk->sk_lock.slock); schedule(); spin_lock_bh(&sk->sk_lock.slock); if (!sock_owned_by_user(sk)) break; } finish_wait(&sk->sk_lock.wq, &wait); } void __release_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { struct sk_buff *skb, *next; while ((skb = sk->sk_backlog.head) != NULL) { sk->sk_backlog.head = sk->sk_backlog.tail = NULL; spin_unlock_bh(&sk->sk_lock.slock); do { next = skb->next; prefetch(next); WARN_ON_ONCE(skb_dst_is_noref(skb)); skb_mark_not_on_list(skb); sk_backlog_rcv(sk, skb); cond_resched(); skb = next; } while (skb != NULL); spin_lock_bh(&sk->sk_lock.slock); } /* * Doing the zeroing here guarantee we can not loop forever * while a wild producer attempts to flood us. */ sk->sk_backlog.len = 0; } void __sk_flush_backlog(struct sock *sk) { spin_lock_bh(&sk->sk_lock.slock); __release_sock(sk); spin_unlock_bh(&sk->sk_lock.slock); } /** * sk_wait_data - wait for data to arrive at sk_receive_queue * @sk: sock to wait on * @timeo: for how long * @skb: last skb seen on sk_receive_queue * * Now socket state including sk->sk_err is changed only under lock, * hence we may omit checks after joining wait queue. * We check receive queue before schedule() only as optimization; * it is very likely that release_sock() added new data. */ int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int rc; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return rc; } EXPORT_SYMBOL(sk_wait_data); /** * __sk_mem_raise_allocated - increase memory_allocated * @sk: socket * @size: memory size to allocate * @amt: pages to allocate * @kind: allocation type * * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc */ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { struct proto *prot = sk->sk_prot; long allocated = sk_memory_allocated_add(sk, amt); bool charged = true; if (mem_cgroup_sockets_enabled && sk->sk_memcg && !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt))) goto suppress_allocation; /* Under limit. */ if (allocated <= sk_prot_mem_limits(sk, 0)) { sk_leave_memory_pressure(sk); return 1; } /* Under pressure. */ if (allocated > sk_prot_mem_limits(sk, 1)) sk_enter_memory_pressure(sk); /* Over hard limit. */ if (allocated > sk_prot_mem_limits(sk, 2)) goto suppress_allocation; /* guarantee minimum buffer size under pressure */ if (kind == SK_MEM_RECV) { if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) return 1; } else { /* SK_MEM_SEND */ int wmem0 = sk_get_wmem0(sk, prot); if (sk->sk_type == SOCK_STREAM) { if (sk->sk_wmem_queued < wmem0) return 1; } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { return 1; } } if (sk_has_memory_pressure(sk)) { u64 alloc; if (!sk_under_memory_pressure(sk)) return 1; alloc = sk_sockets_allocated_read_positive(sk); if (sk_prot_mem_limits(sk, 2) > alloc * sk_mem_pages(sk->sk_wmem_queued + atomic_read(&sk->sk_rmem_alloc) + sk->sk_forward_alloc)) return 1; } suppress_allocation: if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { sk_stream_moderate_sndbuf(sk); /* Fail only if socket is _under_ its sndbuf. * In this case we cannot block, so that we have to fail. */ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) return 1; } if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) trace_sock_exceed_buf_limit(sk, prot, allocated, kind); sk_memory_allocated_sub(sk, amt); if (mem_cgroup_sockets_enabled && sk->sk_memcg) mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); return 0; } EXPORT_SYMBOL(__sk_mem_raise_allocated); /** * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated * @sk: socket * @size: memory size to allocate * @kind: allocation type * * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means * rmem allocation. This function assumes that protocols which have * memory_pressure use sk_wmem_queued as write buffer accounting. */ int __sk_mem_schedule(struct sock *sk, int size, int kind) { int ret, amt = sk_mem_pages(size); sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT; ret = __sk_mem_raise_allocated(sk, size, amt, kind); if (!ret) sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT; return ret; } EXPORT_SYMBOL(__sk_mem_schedule); /** * __sk_mem_reduce_allocated - reclaim memory_allocated * @sk: socket * @amount: number of quanta * * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc */ void __sk_mem_reduce_allocated(struct sock *sk, int amount) { sk_memory_allocated_sub(sk, amount); if (mem_cgroup_sockets_enabled && sk->sk_memcg) mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); if (sk_under_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); } EXPORT_SYMBOL(__sk_mem_reduce_allocated); /** * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated * @sk: socket * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) */ void __sk_mem_reclaim(struct sock *sk, int amount) { amount >>= SK_MEM_QUANTUM_SHIFT; sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; __sk_mem_reduce_allocated(sk, amount); } EXPORT_SYMBOL(__sk_mem_reclaim); int sk_set_peek_off(struct sock *sk, int val) { sk->sk_peek_off = val; return 0; } EXPORT_SYMBOL_GPL(sk_set_peek_off); /* * Set of default routines for initialising struct proto_ops when * the protocol does not support a particular function. In certain * cases where it makes no sense for a protocol to have a "do nothing" * function, some default processing is provided. */ int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_bind); int sock_no_connect(struct socket *sock, struct sockaddr *saddr, int len, int flags) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_connect); int sock_no_socketpair(struct socket *sock1, struct socket *sock2) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_socketpair); int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_accept); int sock_no_getname(struct socket *sock, struct sockaddr *saddr, int peer) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_getname); int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_ioctl); int sock_no_listen(struct socket *sock, int backlog) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_listen); int sock_no_shutdown(struct socket *sock, int how) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_shutdown); int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_sendmsg); int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_sendmsg_locked); int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_recvmsg); int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { /* Mirror missing mmap method error code */ return -ENODEV; } EXPORT_SYMBOL(sock_no_mmap); /* * When a file is received (via SCM_RIGHTS, etc), we must bump the * various sock-based usage counts. */ void __receive_sock(struct file *file) { struct socket *sock; int error; /* * The resulting value of "error" is ignored here since we only * need to take action when the file is a socket and testing * "sock" for NULL is sufficient. */ sock = sock_from_file(file, &error); if (sock) { sock_update_netprioidx(&sock->sk->sk_cgrp_data); sock_update_classid(&sock->sk->sk_cgrp_data); } } ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) { ssize_t res; struct msghdr msg = {.msg_flags = flags}; struct kvec iov; char *kaddr = kmap(page); iov.iov_base = kaddr + offset; iov.iov_len = size; res = kernel_sendmsg(sock, &msg, &iov, 1, size); kunmap(page); return res; } EXPORT_SYMBOL(sock_no_sendpage); ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, int offset, size_t size, int flags) { ssize_t res; struct msghdr msg = {.msg_flags = flags}; struct kvec iov; char *kaddr = kmap(page); iov.iov_base = kaddr + offset; iov.iov_len = size; res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); kunmap(page); return res; } EXPORT_SYMBOL(sock_no_sendpage_locked); /* * Default Socket Callbacks */ static void sock_def_wakeup(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); rcu_read_unlock(); } static void sock_def_error_report(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, EPOLLERR); sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); rcu_read_unlock(); } void sock_def_readable(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } static void sock_def_write_space(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) { wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ if (sock_writeable(sk)) sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } static void sock_def_destruct(struct sock *sk) { } void sk_send_sigurg(struct sock *sk) { if (sk->sk_socket && sk->sk_socket->file) if (send_sigurg(&sk->sk_socket->file->f_owner)) sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); } EXPORT_SYMBOL(sk_send_sigurg); void sk_reset_timer(struct sock *sk, struct timer_list* timer, unsigned long expires) { if (!mod_timer(timer, expires)) sock_hold(sk); } EXPORT_SYMBOL(sk_reset_timer); void sk_stop_timer(struct sock *sk, struct timer_list* timer) { if (del_timer(timer)) __sock_put(sk); } EXPORT_SYMBOL(sk_stop_timer); void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) { if (del_timer_sync(timer)) __sock_put(sk); } EXPORT_SYMBOL(sk_stop_timer_sync); void sock_init_data(struct socket *sock, struct sock *sk) { sk_init_common(sk); sk->sk_send_head = NULL; timer_setup(&sk->sk_timer, NULL, 0); sk->sk_allocation = GFP_KERNEL; sk->sk_rcvbuf = sysctl_rmem_default; sk->sk_sndbuf = sysctl_wmem_default; sk->sk_state = TCP_CLOSE; sk_set_socket(sk, sock); sock_set_flag(sk, SOCK_ZAPPED); if (sock) { sk->sk_type = sock->type; RCU_INIT_POINTER(sk->sk_wq, &sock->wq); sock->sk = sk; sk->sk_uid = SOCK_INODE(sock)->i_uid; } else { RCU_INIT_POINTER(sk->sk_wq, NULL); sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); } rwlock_init(&sk->sk_callback_lock); if (sk->sk_kern_sock) lockdep_set_class_and_name( &sk->sk_callback_lock, af_kern_callback_keys + sk->sk_family, af_family_kern_clock_key_strings[sk->sk_family]); else lockdep_set_class_and_name( &sk->sk_callback_lock, af_callback_keys + sk->sk_family, af_family_clock_key_strings[sk->sk_family]); sk->sk_state_change = sock_def_wakeup; sk->sk_data_ready = sock_def_readable; sk->sk_write_space = sock_def_write_space; sk->sk_error_report = sock_def_error_report; sk->sk_destruct = sock_def_destruct; sk->sk_frag.page = NULL; sk->sk_frag.offset = 0; sk->sk_peek_off = -1; sk->sk_peer_pid = NULL; sk->sk_peer_cred = NULL; spin_lock_init(&sk->sk_peer_lock); sk->sk_write_pending = 0; sk->sk_rcvlowat = 1; sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_stamp = SK_DEFAULT_STAMP; #if BITS_PER_LONG==32 seqlock_init(&sk->sk_stamp_seq); #endif atomic_set(&sk->sk_zckey, 0); #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; sk->sk_ll_usec = sysctl_net_busy_read; #endif sk->sk_max_pacing_rate = ~0UL; sk->sk_pacing_rate = ~0UL; WRITE_ONCE(sk->sk_pacing_shift, 10); sk->sk_incoming_cpu = -1; sk_rx_queue_clear(sk); /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.rst for details) */ smp_wmb(); refcount_set(&sk->sk_refcnt, 1); atomic_set(&sk->sk_drops, 0); } EXPORT_SYMBOL(sock_init_data); void lock_sock_nested(struct sock *sk, int subclass) { might_sleep(); spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_lock.owned) __lock_sock(sk); sk->sk_lock.owned = 1; spin_unlock(&sk->sk_lock.slock); /* * The sk_lock has mutex_lock() semantics here: */ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); local_bh_enable(); } EXPORT_SYMBOL(lock_sock_nested); void release_sock(struct sock *sk) { spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_backlog.tail) __release_sock(sk); /* Warning : release_cb() might need to release sk ownership, * ie call sock_release_ownership(sk) before us. */ if (sk->sk_prot->release_cb) sk->sk_prot->release_cb(sk); sock_release_ownership(sk); if (waitqueue_active(&sk->sk_lock.wq)) wake_up(&sk->sk_lock.wq); spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL(release_sock); /** * lock_sock_fast - fast version of lock_sock * @sk: socket * * This version should be used for very small section, where process wont block * return false if fast path is taken: * * sk_lock.slock locked, owned = 0, BH disabled * * return true if slow path is taken: * * sk_lock.slock unlocked, owned = 1, BH enabled */ bool lock_sock_fast(struct sock *sk) { might_sleep(); spin_lock_bh(&sk->sk_lock.slock); if (!sk->sk_lock.owned) /* * Note : We must disable BH */ return false; __lock_sock(sk); sk->sk_lock.owned = 1; spin_unlock(&sk->sk_lock.slock); /* * The sk_lock has mutex_lock() semantics here: */ mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); local_bh_enable(); return true; } EXPORT_SYMBOL(lock_sock_fast); int sock_gettstamp(struct socket *sock, void __user *userstamp, bool timeval, bool time32) { struct sock *sk = sock->sk; struct timespec64 ts; sock_enable_timestamp(sk, SOCK_TIMESTAMP); ts = ktime_to_timespec64(sock_read_timestamp(sk)); if (ts.tv_sec == -1) return -ENOENT; if (ts.tv_sec == 0) { ktime_t kt = ktime_get_real(); sock_write_timestamp(sk, kt); ts = ktime_to_timespec64(kt); } if (timeval) ts.tv_nsec /= 1000; #ifdef CONFIG_COMPAT_32BIT_TIME if (time32) return put_old_timespec32(&ts, userstamp); #endif #ifdef CONFIG_SPARC64 /* beware of padding in sparc64 timeval */ if (timeval && !in_compat_syscall()) { struct __kernel_old_timeval __user tv = { .tv_sec = ts.tv_sec, .tv_usec = ts.tv_nsec, }; if (copy_to_user(userstamp, &tv, sizeof(tv))) return -EFAULT; return 0; } #endif return put_timespec64(&ts, userstamp); } EXPORT_SYMBOL(sock_gettstamp); void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) { if (!sock_flag(sk, flag)) { unsigned long previous_flags = sk->sk_flags; sock_set_flag(sk, flag); /* * we just set one of the two flags which require net * time stamping, but time stamping might have been on * already because of the other one */ if (sock_needs_netstamp(sk) && !(previous_flags & SK_FLAGS_TIMESTAMP)) net_enable_timestamp(); } } int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type) { struct sock_exterr_skb *serr; struct sk_buff *skb; int copied, err; err = -EAGAIN; skb = sock_dequeue_err_skb(sk); if (skb == NULL) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; sock_recv_timestamp(msg, sk, skb); serr = SKB_EXT_ERR(skb); put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); msg->msg_flags |= MSG_ERRQUEUE; err = copied; out_free_skb: kfree_skb(skb); out: return err; } EXPORT_SYMBOL(sock_recv_errqueue); /* * Get a socket option on an socket. * * FIX: POSIX 1003.1g is very ambiguous here. It states that * asynchronous errors should be reported by getsockopt. We assume * this means if you specify SO_ERROR (otherwise whats the point of it). */ int sock_common_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_getsockopt); int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; int addr_len = 0; int err; err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, flags & ~MSG_DONTWAIT, &addr_len); if (err >= 0) msg->msg_namelen = addr_len; return err; } EXPORT_SYMBOL(sock_common_recvmsg); /* * Set socket options on an inet socket. */ int sock_common_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_setsockopt); void sk_common_release(struct sock *sk) { if (sk->sk_prot->destroy) sk->sk_prot->destroy(sk); /* * Observation: when sk_common_release is called, processes have * no access to socket. But net still has. * Step one, detach it from networking: * * A. Remove from hash tables. */ sk->sk_prot->unhash(sk); /* * In this point socket cannot receive new packets, but it is possible * that some packets are in flight because some CPU runs receiver and * did hash table lookup before we unhashed socket. They will achieve * receive queue and will be purged by socket destructor. * * Also we still have packets pending on receive queue and probably, * our own packets waiting in device queues. sock_destroy will drain * receive queue, but transmitted packets will delay socket destruction * until the last reference will be released. */ sock_orphan(sk); xfrm_sk_free_policy(sk); sk_refcnt_debug_release(sk); sock_put(sk); } EXPORT_SYMBOL(sk_common_release); void sk_get_meminfo(const struct sock *sk, u32 *mem) { memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); } #ifdef CONFIG_PROC_FS #define PROTO_INUSE_NR 64 /* should be enough for the first time */ struct prot_inuse { int val[PROTO_INUSE_NR]; }; static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) { __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val); } EXPORT_SYMBOL_GPL(sock_prot_inuse_add); int sock_prot_inuse_get(struct net *net, struct proto *prot) { int cpu, idx = prot->inuse_idx; int res = 0; for_each_possible_cpu(cpu) res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; return res >= 0 ? res : 0; } EXPORT_SYMBOL_GPL(sock_prot_inuse_get); static void sock_inuse_add(struct net *net, int val) { this_cpu_add(*net->core.sock_inuse, val); } int sock_inuse_get(struct net *net) { int cpu, res = 0; for_each_possible_cpu(cpu) res += *per_cpu_ptr(net->core.sock_inuse, cpu); return res; } EXPORT_SYMBOL_GPL(sock_inuse_get); static int __net_init sock_inuse_init_net(struct net *net) { net->core.prot_inuse = alloc_percpu(struct prot_inuse); if (net->core.prot_inuse == NULL) return -ENOMEM; net->core.sock_inuse = alloc_percpu(int); if (net->core.sock_inuse == NULL) goto out; return 0; out: free_percpu(net->core.prot_inuse); return -ENOMEM; } static void __net_exit sock_inuse_exit_net(struct net *net) { free_percpu(net->core.prot_inuse); free_percpu(net->core.sock_inuse); } static struct pernet_operations net_inuse_ops = { .init = sock_inuse_init_net, .exit = sock_inuse_exit_net, }; static __init int net_inuse_init(void) { if (register_pernet_subsys(&net_inuse_ops)) panic("Cannot initialize net inuse counters"); return 0; } core_initcall(net_inuse_init); static int assign_proto_idx(struct proto *prot) { prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { pr_err("PROTO_INUSE_NR exhausted\n"); return -ENOSPC; } set_bit(prot->inuse_idx, proto_inuse_idx); return 0; } static void release_proto_idx(struct proto *prot) { if (prot->inuse_idx != PROTO_INUSE_NR - 1) clear_bit(prot->inuse_idx, proto_inuse_idx); } #else static inline int assign_proto_idx(struct proto *prot) { return 0; } static inline void release_proto_idx(struct proto *prot) { } static void sock_inuse_add(struct net *net, int val) { } #endif static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) { if (!twsk_prot) return; kfree(twsk_prot->twsk_slab_name); twsk_prot->twsk_slab_name = NULL; kmem_cache_destroy(twsk_prot->twsk_slab); twsk_prot->twsk_slab = NULL; } static void req_prot_cleanup(struct request_sock_ops *rsk_prot) { if (!rsk_prot) return; kfree(rsk_prot->slab_name); rsk_prot->slab_name = NULL; kmem_cache_destroy(rsk_prot->slab); rsk_prot->slab = NULL; } static int req_prot_init(const struct proto *prot) { struct request_sock_ops *rsk_prot = prot->rsk_prot; if (!rsk_prot) return 0; rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name); if (!rsk_prot->slab_name) return -ENOMEM; rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, rsk_prot->obj_size, 0, SLAB_ACCOUNT | prot->slab_flags, NULL); if (!rsk_prot->slab) { pr_crit("%s: Can't create request sock SLAB cache!\n", prot->name); return -ENOMEM; } return 0; } int proto_register(struct proto *prot, int alloc_slab) { int ret = -ENOBUFS; if (alloc_slab) { prot->slab = kmem_cache_create_usercopy(prot->name, prot->obj_size, 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | prot->slab_flags, prot->useroffset, prot->usersize, NULL); if (prot->slab == NULL) { pr_crit("%s: Can't create sock SLAB cache!\n", prot->name); goto out; } if (req_prot_init(prot)) goto out_free_request_sock_slab; if (prot->twsk_prot != NULL) { prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); if (prot->twsk_prot->twsk_slab_name == NULL) goto out_free_request_sock_slab; prot->twsk_prot->twsk_slab = kmem_cache_create(prot->twsk_prot->twsk_slab_name, prot->twsk_prot->twsk_obj_size, 0, SLAB_ACCOUNT | prot->slab_flags, NULL); if (prot->twsk_prot->twsk_slab == NULL) goto out_free_timewait_sock_slab; } } mutex_lock(&proto_list_mutex); ret = assign_proto_idx(prot); if (ret) { mutex_unlock(&proto_list_mutex); goto out_free_timewait_sock_slab; } list_add(&prot->node, &proto_list); mutex_unlock(&proto_list_mutex); return ret; out_free_timewait_sock_slab: if (alloc_slab && prot->twsk_prot) tw_prot_cleanup(prot->twsk_prot); out_free_request_sock_slab: if (alloc_slab) { req_prot_cleanup(prot->rsk_prot); kmem_cache_destroy(prot->slab); prot->slab = NULL; } out: return ret; } EXPORT_SYMBOL(proto_register); void proto_unregister(struct proto *prot) { mutex_lock(&proto_list_mutex); release_proto_idx(prot); list_del(&prot->node); mutex_unlock(&proto_list_mutex); kmem_cache_destroy(prot->slab); prot->slab = NULL; req_prot_cleanup(prot->rsk_prot); tw_prot_cleanup(prot->twsk_prot); } EXPORT_SYMBOL(proto_unregister); int sock_load_diag_module(int family, int protocol) { if (!protocol) { if (!sock_is_registered(family)) return -ENOENT; return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, NETLINK_SOCK_DIAG, family); } #ifdef CONFIG_INET if (family == AF_INET && protocol != IPPROTO_RAW && protocol < MAX_INET_PROTOS && !rcu_access_pointer(inet_protos[protocol])) return -ENOENT; #endif return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, NETLINK_SOCK_DIAG, family, protocol); } EXPORT_SYMBOL(sock_load_diag_module); #ifdef CONFIG_PROC_FS static void *proto_seq_start(struct seq_file *seq, loff_t *pos) __acquires(proto_list_mutex) { mutex_lock(&proto_list_mutex); return seq_list_start_head(&proto_list, *pos); } static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_list_next(v, &proto_list, pos); } static void proto_seq_stop(struct seq_file *seq, void *v) __releases(proto_list_mutex) { mutex_unlock(&proto_list_mutex); } static char proto_method_implemented(const void *method) { return method == NULL ? 'n' : 'y'; } static long sock_prot_memory_allocated(struct proto *proto) { return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; } static const char *sock_prot_memory_pressure(struct proto *proto) { return proto->memory_pressure != NULL ? proto_memory_pressure(proto) ? "yes" : "no" : "NI"; } static void proto_seq_printf(struct seq_file *seq, struct proto *proto) { seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", proto->name, proto->obj_size, sock_prot_inuse_get(seq_file_net(seq), proto), sock_prot_memory_allocated(proto), sock_prot_memory_pressure(proto), proto->max_header, proto->slab == NULL ? "no" : "yes", module_name(proto->owner), proto_method_implemented(proto->close), proto_method_implemented(proto->connect), proto_method_implemented(proto->disconnect), proto_method_implemented(proto->accept), proto_method_implemented(proto->ioctl), proto_method_implemented(proto->init), proto_method_implemented(proto->destroy), proto_method_implemented(proto->shutdown), proto_method_implemented(proto->setsockopt), proto_method_implemented(proto->getsockopt), proto_method_implemented(proto->sendmsg), proto_method_implemented(proto->recvmsg), proto_method_implemented(proto->sendpage), proto_method_implemented(proto->bind), proto_method_implemented(proto->backlog_rcv), proto_method_implemented(proto->hash), proto_method_implemented(proto->unhash), proto_method_implemented(proto->get_port), proto_method_implemented(proto->enter_memory_pressure)); } static int proto_seq_show(struct seq_file *seq, void *v) { if (v == &proto_list) seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", "protocol", "size", "sockets", "memory", "press", "maxhdr", "slab", "module", "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); else proto_seq_printf(seq, list_entry(v, struct proto, node)); return 0; } static const struct seq_operations proto_seq_ops = { .start = proto_seq_start, .next = proto_seq_next, .stop = proto_seq_stop, .show = proto_seq_show, }; static __net_init int proto_init_net(struct net *net) { if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; return 0; } static __net_exit void proto_exit_net(struct net *net) { remove_proc_entry("protocols", net->proc_net); } static __net_initdata struct pernet_operations proto_net_ops = { .init = proto_init_net, .exit = proto_exit_net, }; static int __init proto_init(void) { return register_pernet_subsys(&proto_net_ops); } subsys_initcall(proto_init); #endif /* PROC_FS */ #ifdef CONFIG_NET_RX_BUSY_POLL bool sk_busy_loop_end(void *p, unsigned long start_time) { struct sock *sk = p; return !skb_queue_empty_lockless(&sk->sk_receive_queue) || sk_busy_loop_timeout(sk, start_time); } EXPORT_SYMBOL(sk_busy_loop_end); #endif /* CONFIG_NET_RX_BUSY_POLL */ int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) { if (!sk->sk_prot->bind_add) return -EOPNOTSUPP; return sk->sk_prot->bind_add(sk, addr, addr_len); } EXPORT_SYMBOL(sock_bind_add);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __IPC_NAMESPACE_H__ #define __IPC_NAMESPACE_H__ #include <linux/err.h> #include <linux/idr.h> #include <linux/rwsem.h> #include <linux/notifier.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/refcount.h> #include <linux/rhashtable-types.h> struct user_namespace; struct ipc_ids { int in_use; unsigned short seq; struct rw_semaphore rwsem; struct idr ipcs_idr; int max_idx; int last_idx; /* For wrap around detection */ #ifdef CONFIG_CHECKPOINT_RESTORE int next_id; #endif struct rhashtable key_ht; }; struct ipc_namespace { refcount_t count; struct ipc_ids ids[3]; int sem_ctls[4]; int used_sems; unsigned int msg_ctlmax; unsigned int msg_ctlmnb; unsigned int msg_ctlmni; atomic_t msg_bytes; atomic_t msg_hdrs; size_t shm_ctlmax; size_t shm_ctlall; unsigned long shm_tot; int shm_ctlmni; /* * Defines whether IPC_RMID is forced for _all_ shm segments regardless * of shmctl() */ int shm_rmid_forced; struct notifier_block ipcns_nb; /* The kern_mount of the mqueuefs sb. We take a ref on it */ struct vfsmount *mq_mnt; /* # queues in this ns, protected by mq_lock */ unsigned int mq_queues_count; /* next fields are set through sysctl */ unsigned int mq_queues_max; /* initialized to DFLT_QUEUESMAX */ unsigned int mq_msg_max; /* initialized to DFLT_MSGMAX */ unsigned int mq_msgsize_max; /* initialized to DFLT_MSGSIZEMAX */ unsigned int mq_msg_default; unsigned int mq_msgsize_default; /* user_ns which owns the ipc ns */ struct user_namespace *user_ns; struct ucounts *ucounts; struct llist_node mnt_llist; struct ns_common ns; } __randomize_layout; extern struct ipc_namespace init_ipc_ns; extern spinlock_t mq_lock; #ifdef CONFIG_SYSVIPC extern void shm_destroy_orphaned(struct ipc_namespace *ns); #else /* CONFIG_SYSVIPC */ static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {} #endif /* CONFIG_SYSVIPC */ #ifdef CONFIG_POSIX_MQUEUE extern int mq_init_ns(struct ipc_namespace *ns); /* * POSIX Message Queue default values: * * MIN_*: Lowest value an admin can set the maximum unprivileged limit to * DFLT_*MAX: Default values for the maximum unprivileged limits * DFLT_{MSG,MSGSIZE}: Default values used when the user doesn't supply * an attribute to the open call and the queue must be created * HARD_*: Highest value the maximums can be set to. These are enforced * on CAP_SYS_RESOURCE apps as well making them inviolate (so make them * suitably high) * * POSIX Requirements: * Per app minimum openable message queues - 8. This does not map well * to the fact that we limit the number of queues on a per namespace * basis instead of a per app basis. So, make the default high enough * that no given app should have a hard time opening 8 queues. * Minimum maximum for HARD_MSGMAX - 32767. I bumped this to 65536. * Minimum maximum for HARD_MSGSIZEMAX - POSIX is silent on this. However, * we have run into a situation where running applications in the wild * require this to be at least 5MB, and preferably 10MB, so I set the * value to 16MB in hopes that this user is the worst of the bunch and * the new maximum will handle anyone else. I may have to revisit this * in the future. */ #define DFLT_QUEUESMAX 256 #define MIN_MSGMAX 1 #define DFLT_MSG 10U #define DFLT_MSGMAX 10 #define HARD_MSGMAX 65536 #define MIN_MSGSIZEMAX 128 #define DFLT_MSGSIZE 8192U #define DFLT_MSGSIZEMAX 8192 #define HARD_MSGSIZEMAX (16*1024*1024) #else static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #endif #if defined(CONFIG_IPC_NS) extern struct ipc_namespace *copy_ipcs(unsigned long flags, struct user_namespace *user_ns, struct ipc_namespace *ns); static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) { if (ns) refcount_inc(&ns->count); return ns; } extern void put_ipc_ns(struct ipc_namespace *ns); #else static inline struct ipc_namespace *copy_ipcs(unsigned long flags, struct user_namespace *user_ns, struct ipc_namespace *ns) { if (flags & CLONE_NEWIPC) return ERR_PTR(-EINVAL); return ns; } static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) { return ns; } static inline void put_ipc_ns(struct ipc_namespace *ns) { } #endif #ifdef CONFIG_POSIX_MQUEUE_SYSCTL struct ctl_table_header; extern struct ctl_table_header *mq_register_sysctl_table(void); #else /* CONFIG_POSIX_MQUEUE_SYSCTL */ static inline struct ctl_table_header *mq_register_sysctl_table(void) { return NULL; } #endif /* CONFIG_POSIX_MQUEUE_SYSCTL */ #endif
3 3 1 1 1 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 // SPDX-License-Identifier: GPL-2.0-only /* * NSA Security-Enhanced Linux (SELinux) security module * * This file contains the SELinux hook function implementations. * * Authors: Stephen Smalley, <sds@tycho.nsa.gov> * Chris Vance, <cvance@nai.com> * Wayne Salamon, <wsalamon@nai.com> * James Morris <jmorris@redhat.com> * * Copyright (C) 2001,2002 Networks Associates Technology, Inc. * Copyright (C) 2003-2008 Red Hat, Inc., James Morris <jmorris@redhat.com> * Eric Paris <eparis@redhat.com> * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. * <dgoeddel@trustedcs.com> * Copyright (C) 2006, 2007, 2009 Hewlett-Packard Development Company, L.P. * Paul Moore <paul@paul-moore.com> * Copyright (C) 2007 Hitachi Software Engineering Co., Ltd. * Yuichi Nakamura <ynakam@hitachisoft.jp> * Copyright (C) 2016 Mellanox Technologies */ #include <linux/init.h> #include <linux/kd.h> #include <linux/kernel.h> #include <linux/kernel_read_file.h> #include <linux/tracehook.h> #include <linux/errno.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/lsm_hooks.h> #include <linux/xattr.h> #include <linux/capability.h> #include <linux/unistd.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/proc_fs.h> #include <linux/swap.h> #include <linux/spinlock.h> #include <linux/syscalls.h> #include <linux/dcache.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/tty.h> #include <net/icmp.h> #include <net/ip.h> /* for local_port_range[] */ #include <net/tcp.h> /* struct or_callable used in sock_rcv_skb */ #include <net/inet_connection_sock.h> #include <net/net_namespace.h> #include <net/netlabel.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/interrupt.h> #include <linux/netdevice.h> /* for network interface checks */ #include <net/netlink.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/dccp.h> #include <linux/sctp.h> #include <net/sctp/structs.h> #include <linux/quota.h> #include <linux/un.h> /* for Unix socket types */ #include <net/af_unix.h> /* for Unix socket types */ #include <linux/parser.h> #include <linux/nfs_mount.h> #include <net/ipv6.h> #include <linux/hugetlb.h> #include <linux/personality.h> #include <linux/audit.h> #include <linux/string.h> #include <linux/mutex.h> #include <linux/posix-timers.h> #include <linux/syslog.h> #include <linux/user_namespace.h> #include <linux/export.h> #include <linux/msg.h> #include <linux/shm.h> #include <linux/bpf.h> #include <linux/kernfs.h> #include <linux/stringhash.h> /* for hashlen_string() */ #include <uapi/linux/mount.h> #include <linux/fsnotify.h> #include <linux/fanotify.h> #include "avc.h" #include "objsec.h" #include "netif.h" #include "netnode.h" #include "netport.h" #include "ibpkey.h" #include "xfrm.h" #include "netlabel.h" #include "audit.h" #include "avc_ss.h" struct selinux_state selinux_state; /* SECMARK reference count */ static atomic_t selinux_secmark_refcount = ATOMIC_INIT(0); #ifdef CONFIG_SECURITY_SELINUX_DEVELOP static int selinux_enforcing_boot __initdata; static int __init enforcing_setup(char *str) { unsigned long enforcing; if (!kstrtoul(str, 0, &enforcing)) selinux_enforcing_boot = enforcing ? 1 : 0; return 1; } __setup("enforcing=", enforcing_setup); #else #define selinux_enforcing_boot 1 #endif int selinux_enabled_boot __initdata = 1; #ifdef CONFIG_SECURITY_SELINUX_BOOTPARAM static int __init selinux_enabled_setup(char *str) { unsigned long enabled; if (!kstrtoul(str, 0, &enabled)) selinux_enabled_boot = enabled ? 1 : 0; return 1; } __setup("selinux=", selinux_enabled_setup); #endif static unsigned int selinux_checkreqprot_boot = CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE; static int __init checkreqprot_setup(char *str) { unsigned long checkreqprot; if (!kstrtoul(str, 0, &checkreqprot)) { selinux_checkreqprot_boot = checkreqprot ? 1 : 0; if (checkreqprot) pr_warn("SELinux: checkreqprot set to 1 via kernel parameter. This is deprecated and will be rejected in a future kernel release.\n"); } return 1; } __setup("checkreqprot=", checkreqprot_setup); /** * selinux_secmark_enabled - Check to see if SECMARK is currently enabled * * Description: * This function checks the SECMARK reference counter to see if any SECMARK * targets are currently configured, if the reference counter is greater than * zero SECMARK is considered to be enabled. Returns true (1) if SECMARK is * enabled, false (0) if SECMARK is disabled. If the always_check_network * policy capability is enabled, SECMARK is always considered enabled. * */ static int selinux_secmark_enabled(void) { return (selinux_policycap_alwaysnetwork() || atomic_read(&selinux_secmark_refcount)); } /** * selinux_peerlbl_enabled - Check to see if peer labeling is currently enabled * * Description: * This function checks if NetLabel or labeled IPSEC is enabled. Returns true * (1) if any are enabled or false (0) if neither are enabled. If the * always_check_network policy capability is enabled, peer labeling * is always considered enabled. * */ static int selinux_peerlbl_enabled(void) { return (selinux_policycap_alwaysnetwork() || netlbl_enabled() || selinux_xfrm_enabled()); } static int selinux_netcache_avc_callback(u32 event) { if (event == AVC_CALLBACK_RESET) { sel_netif_flush(); sel_netnode_flush(); sel_netport_flush(); synchronize_net(); } return 0; } static int selinux_lsm_notifier_avc_callback(u32 event) { if (event == AVC_CALLBACK_RESET) { sel_ib_pkey_flush(); call_blocking_lsm_notifier(LSM_POLICY_CHANGE, NULL); } return 0; } /* * initialise the security for the init task */ static void cred_init_security(void) { struct cred *cred = (struct cred *) current->real_cred; struct task_security_struct *tsec; tsec = selinux_cred(cred); tsec->osid = tsec->sid = SECINITSID_KERNEL; } /* * get the security ID of a set of credentials */ static inline u32 cred_sid(const struct cred *cred) { const struct task_security_struct *tsec; tsec = selinux_cred(cred); return tsec->sid; } /* * get the objective security ID of a task */ static inline u32 task_sid(const struct task_struct *task) { u32 sid; rcu_read_lock(); sid = cred_sid(__task_cred(task)); rcu_read_unlock(); return sid; } static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dentry); /* * Try reloading inode security labels that have been marked as invalid. The * @may_sleep parameter indicates when sleeping and thus reloading labels is * allowed; when set to false, returns -ECHILD when the label is * invalid. The @dentry parameter should be set to a dentry of the inode. */ static int __inode_security_revalidate(struct inode *inode, struct dentry *dentry, bool may_sleep) { struct inode_security_struct *isec = selinux_inode(inode); might_sleep_if(may_sleep); if (selinux_initialized(&selinux_state) && isec->initialized != LABEL_INITIALIZED) { if (!may_sleep) return -ECHILD; /* * Try reloading the inode security label. This will fail if * @opt_dentry is NULL and no dentry for this inode can be * found; in that case, continue using the old label. */ inode_doinit_with_dentry(inode, dentry); } return 0; } static struct inode_security_struct *inode_security_novalidate(struct inode *inode) { return selinux_inode(inode); } static struct inode_security_struct *inode_security_rcu(struct inode *inode, bool rcu) { int error; error = __inode_security_revalidate(inode, NULL, !rcu); if (error) return ERR_PTR(error); return selinux_inode(inode); } /* * Get the security label of an inode. */ static struct inode_security_struct *inode_security(struct inode *inode) { __inode_security_revalidate(inode, NULL, true); return selinux_inode(inode); } static struct inode_security_struct *backing_inode_security_novalidate(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); return selinux_inode(inode); } /* * Get the security label of a dentry's backing inode. */ static struct inode_security_struct *backing_inode_security(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); __inode_security_revalidate(inode, dentry, true); return selinux_inode(inode); } static void inode_free_security(struct inode *inode) { struct inode_security_struct *isec = selinux_inode(inode); struct superblock_security_struct *sbsec; if (!isec) return; sbsec = inode->i_sb->s_security; /* * As not all inode security structures are in a list, we check for * empty list outside of the lock to make sure that we won't waste * time taking a lock doing nothing. * * The list_del_init() function can be safely called more than once. * It should not be possible for this function to be called with * concurrent list_add(), but for better safety against future changes * in the code, we use list_empty_careful() here. */ if (!list_empty_careful(&isec->list)) { spin_lock(&sbsec->isec_lock); list_del_init(&isec->list); spin_unlock(&sbsec->isec_lock); } } static void superblock_free_security(struct super_block *sb) { struct superblock_security_struct *sbsec = sb->s_security; sb->s_security = NULL; kfree(sbsec); } struct selinux_mnt_opts { const char *fscontext, *context, *rootcontext, *defcontext; }; static void selinux_free_mnt_opts(void *mnt_opts) { struct selinux_mnt_opts *opts = mnt_opts; kfree(opts->fscontext); kfree(opts->context); kfree(opts->rootcontext); kfree(opts->defcontext); kfree(opts); } enum { Opt_error = -1, Opt_context = 0, Opt_defcontext = 1, Opt_fscontext = 2, Opt_rootcontext = 3, Opt_seclabel = 4, }; #define A(s, has_arg) {#s, sizeof(#s) - 1, Opt_##s, has_arg} static struct { const char *name; int len; int opt; bool has_arg; } tokens[] = { A(context, true), A(fscontext, true), A(defcontext, true), A(rootcontext, true), A(seclabel, false), }; #undef A static int match_opt_prefix(char *s, int l, char **arg) { int i; for (i = 0; i < ARRAY_SIZE(tokens); i++) { size_t len = tokens[i].len; if (len > l || memcmp(s, tokens[i].name, len)) continue; if (tokens[i].has_arg) { if (len == l || s[len] != '=') continue; *arg = s + len + 1; } else if (len != l) continue; return tokens[i].opt; } return Opt_error; } #define SEL_MOUNT_FAIL_MSG "SELinux: duplicate or incompatible mount options\n" static int may_context_mount_sb_relabel(u32 sid, struct superblock_security_struct *sbsec, const struct cred *cred) { const struct task_security_struct *tsec = selinux_cred(cred); int rc; rc = avc_has_perm(&selinux_state, tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__RELABELFROM, NULL); if (rc) return rc; rc = avc_has_perm(&selinux_state, tsec->sid, sid, SECCLASS_FILESYSTEM, FILESYSTEM__RELABELTO, NULL); return rc; } static int may_context_mount_inode_relabel(u32 sid, struct superblock_security_struct *sbsec, const struct cred *cred) { const struct task_security_struct *tsec = selinux_cred(cred); int rc; rc = avc_has_perm(&selinux_state, tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__RELABELFROM, NULL); if (rc) return rc; rc = avc_has_perm(&selinux_state, sid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__ASSOCIATE, NULL); return rc; } static int selinux_is_genfs_special_handling(struct super_block *sb) { /* Special handling. Genfs but also in-core setxattr handler */ return !strcmp(sb->s_type->name, "sysfs") || !strcmp(sb->s_type->name, "pstore") || !strcmp(sb->s_type->name, "debugfs") || !strcmp(sb->s_type->name, "tracefs") || !strcmp(sb->s_type->name, "rootfs") || (selinux_policycap_cgroupseclabel() && (!strcmp(sb->s_type->name, "cgroup") || !strcmp(sb->s_type->name, "cgroup2"))); } static int selinux_is_sblabel_mnt(struct super_block *sb) { struct superblock_security_struct *sbsec = sb->s_security; /* * IMPORTANT: Double-check logic in this function when adding a new * SECURITY_FS_USE_* definition! */ BUILD_BUG_ON(SECURITY_FS_USE_MAX != 7); switch (sbsec->behavior) { case SECURITY_FS_USE_XATTR: case SECURITY_FS_USE_TRANS: case SECURITY_FS_USE_TASK: case SECURITY_FS_USE_NATIVE: return 1; case SECURITY_FS_USE_GENFS: return selinux_is_genfs_special_handling(sb); /* Never allow relabeling on context mounts */ case SECURITY_FS_USE_MNTPOINT: case SECURITY_FS_USE_NONE: default: return 0; } } static int sb_finish_set_opts(struct super_block *sb) { struct superblock_security_struct *sbsec = sb->s_security; struct dentry *root = sb->s_root; struct inode *root_inode = d_backing_inode(root); int rc = 0; if (sbsec->behavior == SECURITY_FS_USE_XATTR) { /* Make sure that the xattr handler exists and that no error other than -ENODATA is returned by getxattr on the root directory. -ENODATA is ok, as this may be the first boot of the SELinux kernel before we have assigned xattr values to the filesystem. */ if (!(root_inode->i_opflags & IOP_XATTR)) { pr_warn("SELinux: (dev %s, type %s) has no " "xattr support\n", sb->s_id, sb->s_type->name); rc = -EOPNOTSUPP; goto out; } rc = __vfs_getxattr(root, root_inode, XATTR_NAME_SELINUX, NULL, 0); if (rc < 0 && rc != -ENODATA) { if (rc == -EOPNOTSUPP) pr_warn("SELinux: (dev %s, type " "%s) has no security xattr handler\n", sb->s_id, sb->s_type->name); else pr_warn("SELinux: (dev %s, type " "%s) getxattr errno %d\n", sb->s_id, sb->s_type->name, -rc); goto out; } } sbsec->flags |= SE_SBINITIALIZED; /* * Explicitly set or clear SBLABEL_MNT. It's not sufficient to simply * leave the flag untouched because sb_clone_mnt_opts might be handing * us a superblock that needs the flag to be cleared. */ if (selinux_is_sblabel_mnt(sb)) sbsec->flags |= SBLABEL_MNT; else sbsec->flags &= ~SBLABEL_MNT; /* Initialize the root inode. */ rc = inode_doinit_with_dentry(root_inode, root); /* Initialize any other inodes associated with the superblock, e.g. inodes created prior to initial policy load or inodes created during get_sb by a pseudo filesystem that directly populates itself. */ spin_lock(&sbsec->isec_lock); while (!list_empty(&sbsec->isec_head)) { struct inode_security_struct *isec = list_first_entry(&sbsec->isec_head, struct inode_security_struct, list); struct inode *inode = isec->inode; list_del_init(&isec->list); spin_unlock(&sbsec->isec_lock); inode = igrab(inode); if (inode) { if (!IS_PRIVATE(inode)) inode_doinit_with_dentry(inode, NULL); iput(inode); } spin_lock(&sbsec->isec_lock); } spin_unlock(&sbsec->isec_lock); out: return rc; } static int bad_option(struct superblock_security_struct *sbsec, char flag, u32 old_sid, u32 new_sid) { char mnt_flags = sbsec->flags & SE_MNTMASK; /* check if the old mount command had the same options */ if (sbsec->flags & SE_SBINITIALIZED) if (!(sbsec->flags & flag) || (old_sid != new_sid)) return 1; /* check if we were passed the same options twice, * aka someone passed context=a,context=b */ if (!(sbsec->flags & SE_SBINITIALIZED)) if (mnt_flags & flag) return 1; return 0; } static int parse_sid(struct super_block *sb, const char *s, u32 *sid) { int rc = security_context_str_to_sid(&selinux_state, s, sid, GFP_KERNEL); if (rc) pr_warn("SELinux: security_context_str_to_sid" "(%s) failed for (dev %s, type %s) errno=%d\n", s, sb->s_id, sb->s_type->name, rc); return rc; } /* * Allow filesystems with binary mount data to explicitly set mount point * labeling information. */ static int selinux_set_mnt_opts(struct super_block *sb, void *mnt_opts, unsigned long kern_flags, unsigned long *set_kern_flags) { const struct cred *cred = current_cred(); struct superblock_security_struct *sbsec = sb->s_security; struct dentry *root = sbsec->sb->s_root; struct selinux_mnt_opts *opts = mnt_opts; struct inode_security_struct *root_isec; u32 fscontext_sid = 0, context_sid = 0, rootcontext_sid = 0; u32 defcontext_sid = 0; int rc = 0; mutex_lock(&sbsec->lock); if (!selinux_initialized(&selinux_state)) { if (!opts) { /* Defer initialization until selinux_complete_init, after the initial policy is loaded and the security server is ready to handle calls. */ goto out; } rc = -EINVAL; pr_warn("SELinux: Unable to set superblock options " "before the security server is initialized\n"); goto out; } if (kern_flags && !set_kern_flags) { /* Specifying internal flags without providing a place to * place the results is not allowed */ rc = -EINVAL; goto out; } /* * Binary mount data FS will come through this function twice. Once * from an explicit call and once from the generic calls from the vfs. * Since the generic VFS calls will not contain any security mount data * we need to skip the double mount verification. * * This does open a hole in which we will not notice if the first * mount using this sb set explict options and a second mount using * this sb does not set any security options. (The first options * will be used for both mounts) */ if ((sbsec->flags & SE_SBINITIALIZED) && (sb->s_type->fs_flags & FS_BINARY_MOUNTDATA) && !opts) goto out; root_isec = backing_inode_security_novalidate(root); /* * parse the mount options, check if they are valid sids. * also check if someone is trying to mount the same sb more * than once with different security options. */ if (opts) { if (opts->fscontext) { rc = parse_sid(sb, opts->fscontext, &fscontext_sid); if (rc) goto out; if (bad_option(sbsec, FSCONTEXT_MNT, sbsec->sid, fscontext_sid)) goto out_double_mount; sbsec->flags |= FSCONTEXT_MNT; } if (opts->context) { rc = parse_sid(sb, opts->context, &context_sid); if (rc) goto out; if (bad_option(sbsec, CONTEXT_MNT, sbsec->mntpoint_sid, context_sid)) goto out_double_mount; sbsec->flags |= CONTEXT_MNT; } if (opts->rootcontext) { rc = parse_sid(sb, opts->rootcontext, &rootcontext_sid); if (rc) goto out; if (bad_option(sbsec, ROOTCONTEXT_MNT, root_isec->sid, rootcontext_sid)) goto out_double_mount; sbsec->flags |= ROOTCONTEXT_MNT; } if (opts->defcontext) { rc = parse_sid(sb, opts->defcontext, &defcontext_sid); if (rc) goto out; if (bad_option(sbsec, DEFCONTEXT_MNT, sbsec->def_sid, defcontext_sid)) goto out_double_mount; sbsec->flags |= DEFCONTEXT_MNT; } } if (sbsec->flags & SE_SBINITIALIZED) { /* previously mounted with options, but not on this attempt? */ if ((sbsec->flags & SE_MNTMASK) && !opts) goto out_double_mount; rc = 0; goto out; } if (strcmp(sb->s_type->name, "proc") == 0) sbsec->flags |= SE_SBPROC | SE_SBGENFS; if (!strcmp(sb->s_type->name, "debugfs") || !strcmp(sb->s_type->name, "tracefs") || !strcmp(sb->s_type->name, "binder") || !strcmp(sb->s_type->name, "bpf") || !strcmp(sb->s_type->name, "pstore")) sbsec->flags |= SE_SBGENFS; if (!strcmp(sb->s_type->name, "sysfs") || !strcmp(sb->s_type->name, "cgroup") || !strcmp(sb->s_type->name, "cgroup2")) sbsec->flags |= SE_SBGENFS | SE_SBGENFS_XATTR; if (!sbsec->behavior) { /* * Determine the labeling behavior to use for this * filesystem type. */ rc = security_fs_use(&selinux_state, sb); if (rc) { pr_warn("%s: security_fs_use(%s) returned %d\n", __func__, sb->s_type->name, rc); goto out; } } /* * If this is a user namespace mount and the filesystem type is not * explicitly whitelisted, then no contexts are allowed on the command * line and security labels must be ignored. */ if (sb->s_user_ns != &init_user_ns && strcmp(sb->s_type->name, "tmpfs") && strcmp(sb->s_type->name, "ramfs") && strcmp(sb->s_type->name, "devpts")) { if (context_sid || fscontext_sid || rootcontext_sid || defcontext_sid) { rc = -EACCES; goto out; } if (sbsec->behavior == SECURITY_FS_USE_XATTR) { sbsec->behavior = SECURITY_FS_USE_MNTPOINT; rc = security_transition_sid(&selinux_state, current_sid(), current_sid(), SECCLASS_FILE, NULL, &sbsec->mntpoint_sid); if (rc) goto out; } goto out_set_opts; } /* sets the context of the superblock for the fs being mounted. */ if (fscontext_sid) { rc = may_context_mount_sb_relabel(fscontext_sid, sbsec, cred); if (rc) goto out; sbsec->sid = fscontext_sid; } /* * Switch to using mount point labeling behavior. * sets the label used on all file below the mountpoint, and will set * the superblock context if not already set. */ if (kern_flags & SECURITY_LSM_NATIVE_LABELS && !context_sid) { sbsec->behavior = SECURITY_FS_USE_NATIVE; *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS; } if (context_sid) { if (!fscontext_sid) { rc = may_context_mount_sb_relabel(context_sid, sbsec, cred); if (rc) goto out; sbsec->sid = context_sid; } else { rc = may_context_mount_inode_relabel(context_sid, sbsec, cred); if (rc) goto out; } if (!rootcontext_sid) rootcontext_sid = context_sid; sbsec->mntpoint_sid = context_sid; sbsec->behavior = SECURITY_FS_USE_MNTPOINT; } if (rootcontext_sid) { rc = may_context_mount_inode_relabel(rootcontext_sid, sbsec, cred); if (rc) goto out; root_isec->sid = rootcontext_sid; root_isec->initialized = LABEL_INITIALIZED; } if (defcontext_sid) { if (sbsec->behavior != SECURITY_FS_USE_XATTR && sbsec->behavior != SECURITY_FS_USE_NATIVE) { rc = -EINVAL; pr_warn("SELinux: defcontext option is " "invalid for this filesystem type\n"); goto out; } if (defcontext_sid != sbsec->def_sid) { rc = may_context_mount_inode_relabel(defcontext_sid, sbsec, cred); if (rc) goto out; } sbsec->def_sid = defcontext_sid; } out_set_opts: rc = sb_finish_set_opts(sb); out: mutex_unlock(&sbsec->lock); return rc; out_double_mount: rc = -EINVAL; pr_warn("SELinux: mount invalid. Same superblock, different " "security settings for (dev %s, type %s)\n", sb->s_id, sb->s_type->name); goto out; } static int selinux_cmp_sb_context(const struct super_block *oldsb, const struct super_block *newsb) { struct superblock_security_struct *old = oldsb->s_security; struct superblock_security_struct *new = newsb->s_security; char oldflags = old->flags & SE_MNTMASK; char newflags = new->flags & SE_MNTMASK; if (oldflags != newflags) goto mismatch; if ((oldflags & FSCONTEXT_MNT) && old->sid != new->sid) goto mismatch; if ((oldflags & CONTEXT_MNT) && old->mntpoint_sid != new->mntpoint_sid) goto mismatch; if ((oldflags & DEFCONTEXT_MNT) && old->def_sid != new->def_sid) goto mismatch; if (oldflags & ROOTCONTEXT_MNT) { struct inode_security_struct *oldroot = backing_inode_security(oldsb->s_root); struct inode_security_struct *newroot = backing_inode_security(newsb->s_root); if (oldroot->sid != newroot->sid) goto mismatch; } return 0; mismatch: pr_warn("SELinux: mount invalid. Same superblock, " "different security settings for (dev %s, " "type %s)\n", newsb->s_id, newsb->s_type->name); return -EBUSY; } static int selinux_sb_clone_mnt_opts(const struct super_block *oldsb, struct super_block *newsb, unsigned long kern_flags, unsigned long *set_kern_flags) { int rc = 0; const struct superblock_security_struct *oldsbsec = oldsb->s_security; struct superblock_security_struct *newsbsec = newsb->s_security; int set_fscontext = (oldsbsec->flags & FSCONTEXT_MNT); int set_context = (oldsbsec->flags & CONTEXT_MNT); int set_rootcontext = (oldsbsec->flags & ROOTCONTEXT_MNT); /* * if the parent was able to be mounted it clearly had no special lsm * mount options. thus we can safely deal with this superblock later */ if (!selinux_initialized(&selinux_state)) return 0; /* * Specifying internal flags without providing a place to * place the results is not allowed. */ if (kern_flags && !set_kern_flags) return -EINVAL; /* how can we clone if the old one wasn't set up?? */ BUG_ON(!(oldsbsec->flags & SE_SBINITIALIZED)); /* if fs is reusing a sb, make sure that the contexts match */ if (newsbsec->flags & SE_SBINITIALIZED) { if ((kern_flags & SECURITY_LSM_NATIVE_LABELS) && !set_context) *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS; return selinux_cmp_sb_context(oldsb, newsb); } mutex_lock(&newsbsec->lock); newsbsec->flags = oldsbsec->flags; newsbsec->sid = oldsbsec->sid; newsbsec->def_sid = oldsbsec->def_sid; newsbsec->behavior = oldsbsec->behavior; if (newsbsec->behavior == SECURITY_FS_USE_NATIVE && !(kern_flags & SECURITY_LSM_NATIVE_LABELS) && !set_context) { rc = security_fs_use(&selinux_state, newsb); if (rc) goto out; } if (kern_flags & SECURITY_LSM_NATIVE_LABELS && !set_context) { newsbsec->behavior = SECURITY_FS_USE_NATIVE; *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS; } if (set_context) { u32 sid = oldsbsec->mntpoint_sid; if (!set_fscontext) newsbsec->sid = sid; if (!set_rootcontext) { struct inode_security_struct *newisec = backing_inode_security(newsb->s_root); newisec->sid = sid; } newsbsec->mntpoint_sid = sid; } if (set_rootcontext) { const struct inode_security_struct *oldisec = backing_inode_security(oldsb->s_root); struct inode_security_struct *newisec = backing_inode_security(newsb->s_root); newisec->sid = oldisec->sid; } sb_finish_set_opts(newsb); out: mutex_unlock(&newsbsec->lock); return rc; } static int selinux_add_opt(int token, const char *s, void **mnt_opts) { struct selinux_mnt_opts *opts = *mnt_opts; if (token == Opt_seclabel) /* eaten and completely ignored */ return 0; if (!opts) { opts = kzalloc(sizeof(struct selinux_mnt_opts), GFP_KERNEL); if (!opts) return -ENOMEM; *mnt_opts = opts; } if (!s) return -ENOMEM; switch (token) { case Opt_context: if (opts->context || opts->defcontext) goto Einval; opts->context = s; break; case Opt_fscontext: if (opts->fscontext) goto Einval; opts->fscontext = s; break; case Opt_rootcontext: if (opts->rootcontext) goto Einval; opts->rootcontext = s; break; case Opt_defcontext: if (opts->context || opts->defcontext) goto Einval; opts->defcontext = s; break; } return 0; Einval: pr_warn(SEL_MOUNT_FAIL_MSG); return -EINVAL; } static int selinux_add_mnt_opt(const char *option, const char *val, int len, void **mnt_opts) { int token = Opt_error; int rc, i; for (i = 0; i < ARRAY_SIZE(tokens); i++) { if (strcmp(option, tokens[i].name) == 0) { token = tokens[i].opt; break; } } if (token == Opt_error) return -EINVAL; if (token != Opt_seclabel) { val = kmemdup_nul(val, len, GFP_KERNEL); if (!val) { rc = -ENOMEM; goto free_opt; } } rc = selinux_add_opt(token, val, mnt_opts); if (unlikely(rc)) { kfree(val); goto free_opt; } return rc; free_opt: if (*mnt_opts) { selinux_free_mnt_opts(*mnt_opts); *mnt_opts = NULL; } return rc; } static int show_sid(struct seq_file *m, u32 sid) { char *context = NULL; u32 len; int rc; rc = security_sid_to_context(&selinux_state, sid, &context, &len); if (!rc) { bool has_comma = context && strchr(context, ','); seq_putc(m, '='); if (has_comma) seq_putc(m, '\"'); seq_escape(m, context, "\"\n\\"); if (has_comma) seq_putc(m, '\"'); } kfree(context); return rc; } static int selinux_sb_show_options(struct seq_file *m, struct super_block *sb) { struct superblock_security_struct *sbsec = sb->s_security; int rc; if (!(sbsec->flags & SE_SBINITIALIZED)) return 0; if (!selinux_initialized(&selinux_state)) return 0; if (sbsec->flags & FSCONTEXT_MNT) { seq_putc(m, ','); seq_puts(m, FSCONTEXT_STR); rc = show_sid(m, sbsec->sid); if (rc) return rc; } if (sbsec->flags & CONTEXT_MNT) { seq_putc(m, ','); seq_puts(m, CONTEXT_STR); rc = show_sid(m, sbsec->mntpoint_sid); if (rc) return rc; } if (sbsec->flags & DEFCONTEXT_MNT) { seq_putc(m, ','); seq_puts(m, DEFCONTEXT_STR); rc = show_sid(m, sbsec->def_sid); if (rc) return rc; } if (sbsec->flags & ROOTCONTEXT_MNT) { struct dentry *root = sbsec->sb->s_root; struct inode_security_struct *isec = backing_inode_security(root); seq_putc(m, ','); seq_puts(m, ROOTCONTEXT_STR); rc = show_sid(m, isec->sid); if (rc) return rc; } if (sbsec->flags & SBLABEL_MNT) { seq_putc(m, ','); seq_puts(m, SECLABEL_STR); } return 0; } static inline u16 inode_mode_to_security_class(umode_t mode) { switch (mode & S_IFMT) { case S_IFSOCK: return SECCLASS_SOCK_FILE; case S_IFLNK: return SECCLASS_LNK_FILE; case S_IFREG: return SECCLASS_FILE; case S_IFBLK: return SECCLASS_BLK_FILE; case S_IFDIR: return SECCLASS_DIR; case S_IFCHR: return SECCLASS_CHR_FILE; case S_IFIFO: return SECCLASS_FIFO_FILE; } return SECCLASS_FILE; } static inline int default_protocol_stream(int protocol) { return (protocol == IPPROTO_IP || protocol == IPPROTO_TCP); } static inline int default_protocol_dgram(int protocol) { return (protocol == IPPROTO_IP || protocol == IPPROTO_UDP); } static inline u16 socket_type_to_security_class(int family, int type, int protocol) { int extsockclass = selinux_policycap_extsockclass(); switch (family) { case PF_UNIX: switch (type) { case SOCK_STREAM: case SOCK_SEQPACKET: return SECCLASS_UNIX_STREAM_SOCKET; case SOCK_DGRAM: case SOCK_RAW: return SECCLASS_UNIX_DGRAM_SOCKET; } break; case PF_INET: case PF_INET6: switch (type) { case SOCK_STREAM: case SOCK_SEQPACKET: if (default_protocol_stream(protocol)) return SECCLASS_TCP_SOCKET; else if (extsockclass && protocol == IPPROTO_SCTP) return SECCLASS_SCTP_SOCKET; else return SECCLASS_RAWIP_SOCKET; case SOCK_DGRAM: if (default_protocol_dgram(protocol)) return SECCLASS_UDP_SOCKET; else if (extsockclass && (protocol == IPPROTO_ICMP || protocol == IPPROTO_ICMPV6)) return SECCLASS_ICMP_SOCKET; else return SECCLASS_RAWIP_SOCKET; case SOCK_DCCP: return SECCLASS_DCCP_SOCKET; default: return SECCLASS_RAWIP_SOCKET; } break; case PF_NETLINK: switch (protocol) { case NETLINK_ROUTE: return SECCLASS_NETLINK_ROUTE_SOCKET; case NETLINK_SOCK_DIAG: return SECCLASS_NETLINK_TCPDIAG_SOCKET; case NETLINK_NFLOG: return SECCLASS_NETLINK_NFLOG_SOCKET; case NETLINK_XFRM: return SECCLASS_NETLINK_XFRM_SOCKET; case NETLINK_SELINUX: return SECCLASS_NETLINK_SELINUX_SOCKET; case NETLINK_ISCSI: return SECCLASS_NETLINK_ISCSI_SOCKET; case NETLINK_AUDIT: return SECCLASS_NETLINK_AUDIT_SOCKET; case NETLINK_FIB_LOOKUP: return SECCLASS_NETLINK_FIB_LOOKUP_SOCKET; case NETLINK_CONNECTOR: return SECCLASS_NETLINK_CONNECTOR_SOCKET; case NETLINK_NETFILTER: return SECCLASS_NETLINK_NETFILTER_SOCKET; case NETLINK_DNRTMSG: return SECCLASS_NETLINK_DNRT_SOCKET; case NETLINK_KOBJECT_UEVENT: return SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET; case NETLINK_GENERIC: return SECCLASS_NETLINK_GENERIC_SOCKET; case NETLINK_SCSITRANSPORT: return SECCLASS_NETLINK_SCSITRANSPORT_SOCKET; case NETLINK_RDMA: return SECCLASS_NETLINK_RDMA_SOCKET; case NETLINK_CRYPTO: return SECCLASS_NETLINK_CRYPTO_SOCKET; default: return SECCLASS_NETLINK_SOCKET; } case PF_PACKET: return SECCLASS_PACKET_SOCKET; case PF_KEY: return SECCLASS_KEY_SOCKET; case PF_APPLETALK: return SECCLASS_APPLETALK_SOCKET; } if (extsockclass) { switch (family) { case PF_AX25: return SECCLASS_AX25_SOCKET; case PF_IPX: return SECCLASS_IPX_SOCKET; case PF_NETROM: return SECCLASS_NETROM_SOCKET; case PF_ATMPVC: return SECCLASS_ATMPVC_SOCKET; case PF_X25: return SECCLASS_X25_SOCKET; case PF_ROSE: return SECCLASS_ROSE_SOCKET; case PF_DECnet: return SECCLASS_DECNET_SOCKET; case PF_ATMSVC: return SECCLASS_ATMSVC_SOCKET; case PF_RDS: return SECCLASS_RDS_SOCKET; case PF_IRDA: return SECCLASS_IRDA_SOCKET; case PF_PPPOX: return SECCLASS_PPPOX_SOCKET; case PF_LLC: return SECCLASS_LLC_SOCKET; case PF_CAN: return SECCLASS_CAN_SOCKET; case PF_TIPC: return SECCLASS_TIPC_SOCKET; case PF_BLUETOOTH: return SECCLASS_BLUETOOTH_SOCKET; case PF_IUCV: return SECCLASS_IUCV_SOCKET; case PF_RXRPC: return SECCLASS_RXRPC_SOCKET; case PF_ISDN: return SECCLASS_ISDN_SOCKET; case PF_PHONET: return SECCLASS_PHONET_SOCKET; case PF_IEEE802154: return SECCLASS_IEEE802154_SOCKET; case PF_CAIF: return SECCLASS_CAIF_SOCKET; case PF_ALG: return SECCLASS_ALG_SOCKET; case PF_NFC: return SECCLASS_NFC_SOCKET; case PF_VSOCK: return SECCLASS_VSOCK_SOCKET; case PF_KCM: return SECCLASS_KCM_SOCKET; case PF_QIPCRTR: return SECCLASS_QIPCRTR_SOCKET; case PF_SMC: return SECCLASS_SMC_SOCKET; case PF_XDP: return SECCLASS_XDP_SOCKET; #if PF_MAX > 45 #error New address family defined, please update this function. #endif } } return SECCLASS_SOCKET; } static int selinux_genfs_get_sid(struct dentry *dentry, u16 tclass, u16 flags, u32 *sid) { int rc; struct super_block *sb = dentry->d_sb; char *buffer, *path; buffer = (char *)__get_free_page(GFP_KERNEL); if (!buffer) return -ENOMEM; path = dentry_path_raw(dentry, buffer, PAGE_SIZE); if (IS_ERR(path)) rc = PTR_ERR(path); else { if (flags & SE_SBPROC) { /* each process gets a /proc/PID/ entry. Strip off the * PID part to get a valid selinux labeling. * e.g. /proc/1/net/rpc/nfs -> /net/rpc/nfs */ while (path[1] >= '0' && path[1] <= '9') { path[1] = '/'; path++; } } rc = security_genfs_sid(&selinux_state, sb->s_type->name, path, tclass, sid); if (rc == -ENOENT) { /* No match in policy, mark as unlabeled. */ *sid = SECINITSID_UNLABELED; rc = 0; } } free_page((unsigned long)buffer); return rc; } static int inode_doinit_use_xattr(struct inode *inode, struct dentry *dentry, u32 def_sid, u32 *sid) { #define INITCONTEXTLEN 255 char *context; unsigned int len; int rc; len = INITCONTEXTLEN; context = kmalloc(len + 1, GFP_NOFS); if (!context) return -ENOMEM; context[len] = '\0'; rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX, context, len); if (rc == -ERANGE) { kfree(context); /* Need a larger buffer. Query for the right size. */ rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX, NULL, 0); if (rc < 0) return rc; len = rc; context = kmalloc(len + 1, GFP_NOFS); if (!context) return -ENOMEM; context[len] = '\0'; rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX, context, len); } if (rc < 0) { kfree(context); if (rc != -ENODATA) { pr_warn("SELinux: %s: getxattr returned %d for dev=%s ino=%ld\n", __func__, -rc, inode->i_sb->s_id, inode->i_ino); return rc; } *sid = def_sid; return 0; } rc = security_context_to_sid_default(&selinux_state, context, rc, sid, def_sid, GFP_NOFS); if (rc) { char *dev = inode->i_sb->s_id; unsigned long ino = inode->i_ino; if (rc == -EINVAL) { pr_notice_ratelimited("SELinux: inode=%lu on dev=%s was found to have an invalid context=%s. This indicates you may need to relabel the inode or the filesystem in question.\n", ino, dev, context); } else { pr_warn("SELinux: %s: context_to_sid(%s) returned %d for dev=%s ino=%ld\n", __func__, context, -rc, dev, ino); } } kfree(context); return 0; } /* The inode's security attributes must be initialized before first use. */ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dentry) { struct superblock_security_struct *sbsec = NULL; struct inode_security_struct *isec = selinux_inode(inode); u32 task_sid, sid = 0; u16 sclass; struct dentry *dentry; int rc = 0; if (isec->initialized == LABEL_INITIALIZED) return 0; spin_lock(&isec->lock); if (isec->initialized == LABEL_INITIALIZED) goto out_unlock; if (isec->sclass == SECCLASS_FILE) isec->sclass = inode_mode_to_security_class(inode->i_mode); sbsec = inode->i_sb->s_security; if (!(sbsec->flags & SE_SBINITIALIZED)) { /* Defer initialization until selinux_complete_init, after the initial policy is loaded and the security server is ready to handle calls. */ spin_lock(&sbsec->isec_lock); if (list_empty(&isec->list)) list_add(&isec->list, &sbsec->isec_head); spin_unlock(&sbsec->isec_lock); goto out_unlock; } sclass = isec->sclass; task_sid = isec->task_sid; sid = isec->sid; isec->initialized = LABEL_PENDING; spin_unlock(&isec->lock); switch (sbsec->behavior) { case SECURITY_FS_USE_NATIVE: break; case SECURITY_FS_USE_XATTR: if (!(inode->i_opflags & IOP_XATTR)) { sid = sbsec->def_sid; break; } /* Need a dentry, since the xattr API requires one. Life would be simpler if we could just pass the inode. */ if (opt_dentry) { /* Called from d_instantiate or d_splice_alias. */ dentry = dget(opt_dentry); } else { /* * Called from selinux_complete_init, try to find a dentry. * Some filesystems really want a connected one, so try * that first. We could split SECURITY_FS_USE_XATTR in * two, depending upon that... */ dentry = d_find_alias(inode); if (!dentry) dentry = d_find_any_alias(inode); } if (!dentry) { /* * this is can be hit on boot when a file is accessed * before the policy is loaded. When we load policy we * may find inodes that have no dentry on the * sbsec->isec_head list. No reason to complain as these * will get fixed up the next time we go through * inode_doinit with a dentry, before these inodes could * be used again by userspace. */ goto out_invalid; } rc = inode_doinit_use_xattr(inode, dentry, sbsec->def_sid, &sid); dput(dentry); if (rc) goto out; break; case SECURITY_FS_USE_TASK: sid = task_sid; break; case SECURITY_FS_USE_TRANS: /* Default to the fs SID. */ sid = sbsec->sid; /* Try to obtain a transition SID. */ rc = security_transition_sid(&selinux_state, task_sid, sid, sclass, NULL, &sid); if (rc) goto out; break; case SECURITY_FS_USE_MNTPOINT: sid = sbsec->mntpoint_sid; break; default: /* Default to the fs superblock SID. */ sid = sbsec->sid; if ((sbsec->flags & SE_SBGENFS) && (!S_ISLNK(inode->i_mode) || selinux_policycap_genfs_seclabel_symlinks())) { /* We must have a dentry to determine the label on * procfs inodes */ if (opt_dentry) { /* Called from d_instantiate or * d_splice_alias. */ dentry = dget(opt_dentry); } else { /* Called from selinux_complete_init, try to * find a dentry. Some filesystems really want * a connected one, so try that first. */ dentry = d_find_alias(inode); if (!dentry) dentry = d_find_any_alias(inode); } /* * This can be hit on boot when a file is accessed * before the policy is loaded. When we load policy we * may find inodes that have no dentry on the * sbsec->isec_head list. No reason to complain as * these will get fixed up the next time we go through * inode_doinit() with a dentry, before these inodes * could be used again by userspace. */ if (!dentry) goto out_invalid; rc = selinux_genfs_get_sid(dentry, sclass, sbsec->flags, &sid); if (rc) { dput(dentry); goto out; } if ((sbsec->flags & SE_SBGENFS_XATTR) && (inode->i_opflags & IOP_XATTR)) { rc = inode_doinit_use_xattr(inode, dentry, sid, &sid); if (rc) { dput(dentry); goto out; } } dput(dentry); } break; } out: spin_lock(&isec->lock); if (isec->initialized == LABEL_PENDING) { if (rc) { isec->initialized = LABEL_INVALID; goto out_unlock; } isec->initialized = LABEL_INITIALIZED; isec->sid = sid; } out_unlock: spin_unlock(&isec->lock); return rc; out_invalid: spin_lock(&isec->lock); if (isec->initialized == LABEL_PENDING) { isec->initialized = LABEL_INVALID; isec->sid = sid; } spin_unlock(&isec->lock); return 0; } /* Convert a Linux signal to an access vector. */ static inline u32 signal_to_av(int sig) { u32 perm = 0; switch (sig) { case SIGCHLD: /* Commonly granted from child to parent. */ perm = PROCESS__SIGCHLD; break; case SIGKILL: /* Cannot be caught or ignored */ perm = PROCESS__SIGKILL; break; case SIGSTOP: /* Cannot be caught or ignored */ perm = PROCESS__SIGSTOP; break; default: /* All other signals. */ perm = PROCESS__SIGNAL; break; } return perm; } #if CAP_LAST_CAP > 63 #error Fix SELinux to handle capabilities > 63. #endif /* Check whether a task is allowed to use a capability. */ static int cred_has_capability(const struct cred *cred, int cap, unsigned int opts, bool initns) { struct common_audit_data ad; struct av_decision avd; u16 sclass; u32 sid = cred_sid(cred); u32 av = CAP_TO_MASK(cap); int rc; ad.type = LSM_AUDIT_DATA_CAP; ad.u.cap = cap; switch (CAP_TO_INDEX(cap)) { case 0: sclass = initns ? SECCLASS_CAPABILITY : SECCLASS_CAP_USERNS; break; case 1: sclass = initns ? SECCLASS_CAPABILITY2 : SECCLASS_CAP2_USERNS; break; default: pr_err("SELinux: out of range capability %d\n", cap); BUG(); return -EINVAL; } rc = avc_has_perm_noaudit(&selinux_state, sid, sid, sclass, av, 0, &avd); if (!(opts & CAP_OPT_NOAUDIT)) { int rc2 = avc_audit(&selinux_state, sid, sid, sclass, av, &avd, rc, &ad, 0); if (rc2) return rc2; } return rc; } /* Check whether a task has a particular permission to an inode. The 'adp' parameter is optional and allows other audit data to be passed (e.g. the dentry). */ static int inode_has_perm(const struct cred *cred, struct inode *inode, u32 perms, struct common_audit_data *adp) { struct inode_security_struct *isec; u32 sid; validate_creds(cred); if (unlikely(IS_PRIVATE(inode))) return 0; sid = cred_sid(cred); isec = selinux_inode(inode); return avc_has_perm(&selinux_state, sid, isec->sid, isec->sclass, perms, adp); } /* Same as inode_has_perm, but pass explicit audit data containing the dentry to help the auditing code to more easily generate the pathname if needed. */ static inline int dentry_has_perm(const struct cred *cred, struct dentry *dentry, u32 av) { struct inode *inode = d_backing_inode(dentry); struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; __inode_security_revalidate(inode, dentry, true); return inode_has_perm(cred, inode, av, &ad); } /* Same as inode_has_perm, but pass explicit audit data containing the path to help the auditing code to more easily generate the pathname if needed. */ static inline int path_has_perm(const struct cred *cred, const struct path *path, u32 av) { struct inode *inode = d_backing_inode(path->dentry); struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_PATH; ad.u.path = *path; __inode_security_revalidate(inode, path->dentry, true); return inode_has_perm(cred, inode, av, &ad); } /* Same as path_has_perm, but uses the inode from the file struct. */ static inline int file_path_has_perm(const struct cred *cred, struct file *file, u32 av) { struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = file; return inode_has_perm(cred, file_inode(file), av, &ad); } #ifdef CONFIG_BPF_SYSCALL static int bpf_fd_pass(struct file *file, u32 sid); #endif /* Check whether a task can use an open file descriptor to access an inode in a given way. Check access to the descriptor itself, and then use dentry_has_perm to check a particular permission to the file. Access to the descriptor is implicitly granted if it has the same SID as the process. If av is zero, then access to the file is not checked, e.g. for cases where only the descriptor is affected like seek. */ static int file_has_perm(const struct cred *cred, struct file *file, u32 av) { struct file_security_struct *fsec = selinux_file(file); struct inode *inode = file_inode(file); struct common_audit_data ad; u32 sid = cred_sid(cred); int rc; ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = file; if (sid != fsec->sid) { rc = avc_has_perm(&selinux_state, sid, fsec->sid, SECCLASS_FD, FD__USE, &ad); if (rc) goto out; } #ifdef CONFIG_BPF_SYSCALL rc = bpf_fd_pass(file, cred_sid(cred)); if (rc) return rc; #endif /* av is zero if only checking access to the descriptor. */ rc = 0; if (av) rc = inode_has_perm(cred, inode, av, &ad); out: return rc; } /* * Determine the label for an inode that might be unioned. */ static int selinux_determine_inode_label(const struct task_security_struct *tsec, struct inode *dir, const struct qstr *name, u16 tclass, u32 *_new_isid) { const struct superblock_security_struct *sbsec = dir->i_sb->s_security; if ((sbsec->flags & SE_SBINITIALIZED) && (sbsec->behavior == SECURITY_FS_USE_MNTPOINT)) { *_new_isid = sbsec->mntpoint_sid; } else if ((sbsec->flags & SBLABEL_MNT) && tsec->create_sid) { *_new_isid = tsec->create_sid; } else { const struct inode_security_struct *dsec = inode_security(dir); return security_transition_sid(&selinux_state, tsec->sid, dsec->sid, tclass, name, _new_isid); } return 0; } /* Check whether a task can create a file. */ static int may_create(struct inode *dir, struct dentry *dentry, u16 tclass) { const struct task_security_struct *tsec = selinux_cred(current_cred()); struct inode_security_struct *dsec; struct superblock_security_struct *sbsec; u32 sid, newsid; struct common_audit_data ad; int rc; dsec = inode_security(dir); sbsec = dir->i_sb->s_security; sid = tsec->sid; ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; rc = avc_has_perm(&selinux_state, sid, dsec->sid, SECCLASS_DIR, DIR__ADD_NAME | DIR__SEARCH, &ad); if (rc) return rc; rc = selinux_determine_inode_label(tsec, dir, &dentry->d_name, tclass, &newsid); if (rc) return rc; rc = avc_has_perm(&selinux_state, sid, newsid, tclass, FILE__CREATE, &ad); if (rc) return rc; return avc_has_perm(&selinux_state, newsid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__ASSOCIATE, &ad); } #define MAY_LINK 0 #define MAY_UNLINK 1 #define MAY_RMDIR 2 /* Check whether a task can link, unlink, or rmdir a file/directory. */ static int may_link(struct inode *dir, struct dentry *dentry, int kind) { struct inode_security_struct *dsec, *isec; struct common_audit_data ad; u32 sid = current_sid(); u32 av; int rc; dsec = inode_security(dir); isec = backing_inode_security(dentry); ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; av = DIR__SEARCH; av |= (kind ? DIR__REMOVE_NAME : DIR__ADD_NAME); rc = avc_has_perm(&selinux_state, sid, dsec->sid, SECCLASS_DIR, av, &ad); if (rc) return rc; switch (kind) { case MAY_LINK: av = FILE__LINK; break; case MAY_UNLINK: av = FILE__UNLINK; break; case MAY_RMDIR: av = DIR__RMDIR; break; default: pr_warn("SELinux: %s: unrecognized kind %d\n", __func__, kind); return 0; } rc = avc_has_perm(&selinux_state, sid, isec->sid, isec->sclass, av, &ad); return rc; } static inline int may_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct inode_security_struct *old_dsec, *new_dsec, *old_isec, *new_isec; struct common_audit_data ad; u32 sid = current_sid(); u32 av; int old_is_dir, new_is_dir; int rc; old_dsec = inode_security(old_dir); old_isec = backing_inode_security(old_dentry); old_is_dir = d_is_dir(old_dentry); new_dsec = inode_security(new_dir); ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = old_dentry; rc = avc_has_perm(&selinux_state, sid, old_dsec->sid, SECCLASS_DIR, DIR__REMOVE_NAME | DIR__SEARCH, &ad); if (rc) return rc; rc = avc_has_perm(&selinux_state, sid, old_isec->sid, old_isec->sclass, FILE__RENAME, &ad); if (rc) return rc; if (old_is_dir && new_dir != old_dir) { rc = avc_has_perm(&selinux_state, sid, old_isec->sid, old_isec->sclass, DIR__REPARENT, &ad); if (rc) return rc; } ad.u.dentry = new_dentry; av = DIR__ADD_NAME | DIR__SEARCH; if (d_is_positive(new_dentry)) av |= DIR__REMOVE_NAME; rc = avc_has_perm(&selinux_state, sid, new_dsec->sid, SECCLASS_DIR, av, &ad); if (rc) return rc; if (d_is_positive(new_dentry)) { new_isec = backing_inode_security(new_dentry); new_is_dir = d_is_dir(new_dentry); rc = avc_has_perm(&selinux_state, sid, new_isec->sid, new_isec->sclass, (new_is_dir ? DIR__RMDIR : FILE__UNLINK), &ad); if (rc) return rc; } return 0; } /* Check whether a task can perform a filesystem operation. */ static int superblock_has_perm(const struct cred *cred, struct super_block *sb, u32 perms, struct common_audit_data *ad) { struct superblock_security_struct *sbsec; u32 sid = cred_sid(cred); sbsec = sb->s_security; return avc_has_perm(&selinux_state, sid, sbsec->sid, SECCLASS_FILESYSTEM, perms, ad); } /* Convert a Linux mode and permission mask to an access vector. */ static inline u32 file_mask_to_av(int mode, int mask) { u32 av = 0; if (!S_ISDIR(mode)) { if (mask & MAY_EXEC) av |= FILE__EXECUTE; if (mask & MAY_READ) av |= FILE__READ; if (mask & MAY_APPEND) av |= FILE__APPEND; else if (mask & MAY_WRITE) av |= FILE__WRITE; } else { if (mask & MAY_EXEC) av |= DIR__SEARCH; if (mask & MAY_WRITE) av |= DIR__WRITE; if (mask & MAY_READ) av |= DIR__READ; } return av; } /* Convert a Linux file to an access vector. */ static inline u32 file_to_av(struct file *file) { u32 av = 0; if (file->f_mode & FMODE_READ) av |= FILE__READ; if (file->f_mode & FMODE_WRITE) { if (file->f_flags & O_APPEND) av |= FILE__APPEND; else av |= FILE__WRITE; } if (!av) { /* * Special file opened with flags 3 for ioctl-only use. */ av = FILE__IOCTL; } return av; } /* * Convert a file to an access vector and include the correct * open permission. */ static inline u32 open_file_to_av(struct file *file) { u32 av = file_to_av(file); struct inode *inode = file_inode(file); if (selinux_policycap_openperm() && inode->i_sb->s_magic != SOCKFS_MAGIC) av |= FILE__OPEN; return av; } /* Hook functions begin here. */ static int selinux_binder_set_context_mgr(struct task_struct *mgr) { u32 mysid = current_sid(); u32 mgrsid = task_sid(mgr); return avc_has_perm(&selinux_state, mysid, mgrsid, SECCLASS_BINDER, BINDER__SET_CONTEXT_MGR, NULL); } static int selinux_binder_transaction(struct task_struct *from, struct task_struct *to) { u32 mysid = current_sid(); u32 fromsid = task_sid(from); u32 tosid = task_sid(to); int rc; if (mysid != fromsid) { rc = avc_has_perm(&selinux_state, mysid, fromsid, SECCLASS_BINDER, BINDER__IMPERSONATE, NULL); if (rc) return rc; } return avc_has_perm(&selinux_state, fromsid, tosid, SECCLASS_BINDER, BINDER__CALL, NULL); } static int selinux_binder_transfer_binder(struct task_struct *from, struct task_struct *to) { u32 fromsid = task_sid(from); u32 tosid = task_sid(to); return avc_has_perm(&selinux_state, fromsid, tosid, SECCLASS_BINDER, BINDER__TRANSFER, NULL); } static int selinux_binder_transfer_file(struct task_struct *from, struct task_struct *to, struct file *file) { u32 sid = task_sid(to); struct file_security_struct *fsec = selinux_file(file); struct dentry *dentry = file->f_path.dentry; struct inode_security_struct *isec; struct common_audit_data ad; int rc; ad.type = LSM_AUDIT_DATA_PATH; ad.u.path = file->f_path; if (sid != fsec->sid) { rc = avc_has_perm(&selinux_state, sid, fsec->sid, SECCLASS_FD, FD__USE, &ad); if (rc) return rc; } #ifdef CONFIG_BPF_SYSCALL rc = bpf_fd_pass(file, sid); if (rc) return rc; #endif if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; isec = backing_inode_security(dentry); return avc_has_perm(&selinux_state, sid, isec->sid, isec->sclass, file_to_av(file), &ad); } static int selinux_ptrace_access_check(struct task_struct *child, unsigned int mode) { u32 sid = current_sid(); u32 csid = task_sid(child); if (mode & PTRACE_MODE_READ) return avc_has_perm(&selinux_state, sid, csid, SECCLASS_FILE, FILE__READ, NULL); return avc_has_perm(&selinux_state, sid, csid, SECCLASS_PROCESS, PROCESS__PTRACE, NULL); } static int selinux_ptrace_traceme(struct task_struct *parent) { return avc_has_perm(&selinux_state, task_sid(parent), current_sid(), SECCLASS_PROCESS, PROCESS__PTRACE, NULL); } static int selinux_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { return avc_has_perm(&selinux_state, current_sid(), task_sid(target), SECCLASS_PROCESS, PROCESS__GETCAP, NULL); } static int selinux_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) { return avc_has_perm(&selinux_state, cred_sid(old), cred_sid(new), SECCLASS_PROCESS, PROCESS__SETCAP, NULL); } /* * (This comment used to live with the selinux_task_setuid hook, * which was removed). * * Since setuid only affects the current process, and since the SELinux * controls are not based on the Linux identity attributes, SELinux does not * need to control this operation. However, SELinux does control the use of * the CAP_SETUID and CAP_SETGID capabilities using the capable hook. */ static int selinux_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { return cred_has_capability(cred, cap, opts, ns == &init_user_ns); } static int selinux_quotactl(int cmds, int type, int id, struct super_block *sb) { const struct cred *cred = current_cred(); int rc = 0; if (!sb) return 0; switch (cmds) { case Q_SYNC: case Q_QUOTAON: case Q_QUOTAOFF: case Q_SETINFO: case Q_SETQUOTA: case Q_XQUOTAOFF: case Q_XQUOTAON: case Q_XSETQLIM: rc = superblock_has_perm(cred, sb, FILESYSTEM__QUOTAMOD, NULL); break; case Q_GETFMT: case Q_GETINFO: case Q_GETQUOTA: case Q_XGETQUOTA: case Q_XGETQSTAT: case Q_XGETQSTATV: case Q_XGETNEXTQUOTA: rc = superblock_has_perm(cred, sb, FILESYSTEM__QUOTAGET, NULL); break; default: rc = 0; /* let the kernel handle invalid cmds */ break; } return rc; } static int selinux_quota_on(struct dentry *dentry) { const struct cred *cred = current_cred(); return dentry_has_perm(cred, dentry, FILE__QUOTAON); } static int selinux_syslog(int type) { switch (type) { case SYSLOG_ACTION_READ_ALL: /* Read last kernel messages */ case SYSLOG_ACTION_SIZE_BUFFER: /* Return size of the log buffer */ return avc_has_perm(&selinux_state, current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__SYSLOG_READ, NULL); case SYSLOG_ACTION_CONSOLE_OFF: /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_ON: /* Enable logging to console */ /* Set level of messages printed to console */ case SYSLOG_ACTION_CONSOLE_LEVEL: return avc_has_perm(&selinux_state, current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__SYSLOG_CONSOLE, NULL); } /* All other syslog types */ return avc_has_perm(&selinux_state, current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__SYSLOG_MOD, NULL); } /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to * succeed and -ENOMEM implies there is not. * * Do not audit the selinux permission check, as this is applied to all * processes that allocate mappings. */ static int selinux_vm_enough_memory(struct mm_struct *mm, long pages) { int rc, cap_sys_admin = 0; rc = cred_has_capability(current_cred(), CAP_SYS_ADMIN, CAP_OPT_NOAUDIT, true); if (rc == 0) cap_sys_admin = 1; return cap_sys_admin; } /* binprm security operations */ static u32 ptrace_parent_sid(void) { u32 sid = 0; struct task_struct *tracer; rcu_read_lock(); tracer = ptrace_parent(current); if (tracer) sid = task_sid(tracer); rcu_read_unlock(); return sid; } static int check_nnp_nosuid(const struct linux_binprm *bprm, const struct task_security_struct *old_tsec, const struct task_security_struct *new_tsec) { int nnp = (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS); int nosuid = !mnt_may_suid(bprm->file->f_path.mnt); int rc; u32 av; if (!nnp && !nosuid) return 0; /* neither NNP nor nosuid */ if (new_tsec->sid == old_tsec->sid) return 0; /* No change in credentials */ /* * If the policy enables the nnp_nosuid_transition policy capability, * then we permit transitions under NNP or nosuid if the * policy allows the corresponding permission between * the old and new contexts. */ if (selinux_policycap_nnp_nosuid_transition()) { av = 0; if (nnp) av |= PROCESS2__NNP_TRANSITION; if (nosuid) av |= PROCESS2__NOSUID_TRANSITION; rc = avc_has_perm(&selinux_state, old_tsec->sid, new_tsec->sid, SECCLASS_PROCESS2, av, NULL); if (!rc) return 0; } /* * We also permit NNP or nosuid transitions to bounded SIDs, * i.e. SIDs that are guaranteed to only be allowed a subset * of the permissions of the current SID. */ rc = security_bounded_transition(&selinux_state, old_tsec->sid, new_tsec->sid); if (!rc) return 0; /* * On failure, preserve the errno values for NNP vs nosuid. * NNP: Operation not permitted for caller. * nosuid: Permission denied to file. */ if (nnp) return -EPERM; return -EACCES; } static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) { const struct task_security_struct *old_tsec; struct task_security_struct *new_tsec; struct inode_security_struct *isec; struct common_audit_data ad; struct inode *inode = file_inode(bprm->file); int rc; /* SELinux context only depends on initial program or script and not * the script interpreter */ old_tsec = selinux_cred(current_cred()); new_tsec = selinux_cred(bprm->cred); isec = inode_security(inode); /* Default to the current task SID. */ new_tsec->sid = old_tsec->sid; new_tsec->osid = old_tsec->sid; /* Reset fs, key, and sock SIDs on execve. */ new_tsec->create_sid = 0; new_tsec->keycreate_sid = 0; new_tsec->sockcreate_sid = 0; if (old_tsec->exec_sid) { new_tsec->sid = old_tsec->exec_sid; /* Reset exec SID on execve. */ new_tsec->exec_sid = 0; /* Fail on NNP or nosuid if not an allowed transition. */ rc = check_nnp_nosuid(bprm, old_tsec, new_tsec); if (rc) return rc; } else { /* Check for a default transition on this program. */ rc = security_transition_sid(&selinux_state, old_tsec->sid, isec->sid, SECCLASS_PROCESS, NULL, &new_tsec->sid); if (rc) return rc; /* * Fallback to old SID on NNP or nosuid if not an allowed * transition. */ rc = check_nnp_nosuid(bprm, old_tsec, new_tsec); if (rc) new_tsec->sid = old_tsec->sid; } ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = bprm->file; if (new_tsec->sid == old_tsec->sid) { rc = avc_has_perm(&selinux_state, old_tsec->sid, isec->sid, SECCLASS_FILE, FILE__EXECUTE_NO_TRANS, &ad); if (rc) return rc; } else { /* Check permissions for the transition. */ rc = avc_has_perm(&selinux_state, old_tsec->sid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__TRANSITION, &ad); if (rc) return rc; rc = avc_has_perm(&selinux_state, new_tsec->sid, isec->sid, SECCLASS_FILE, FILE__ENTRYPOINT, &ad); if (rc) return rc; /* Check for shared state */ if (bprm->unsafe & LSM_UNSAFE_SHARE) { rc = avc_has_perm(&selinux_state, old_tsec->sid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__SHARE, NULL); if (rc) return -EPERM; } /* Make sure that anyone attempting to ptrace over a task that * changes its SID has the appropriate permit */ if (bprm->unsafe & LSM_UNSAFE_PTRACE) { u32 ptsid = ptrace_parent_sid(); if (ptsid != 0) { rc = avc_has_perm(&selinux_state, ptsid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__PTRACE, NULL); if (rc) return -EPERM; } } /* Clear any possibly unsafe personality bits on exec: */ bprm->per_clear |= PER_CLEAR_ON_SETID; /* Enable secure mode for SIDs transitions unless the noatsecure permission is granted between the two SIDs, i.e. ahp returns 0. */ rc = avc_has_perm(&selinux_state, old_tsec->sid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__NOATSECURE, NULL); bprm->secureexec |= !!rc; } return 0; } static int match_file(const void *p, struct file *file, unsigned fd) { return file_has_perm(p, file, file_to_av(file)) ? fd + 1 : 0; } /* Derived from fs/exec.c:flush_old_files. */ static inline void flush_unauthorized_files(const struct cred *cred, struct files_struct *files) { struct file *file, *devnull = NULL; struct tty_struct *tty; int drop_tty = 0; unsigned n; tty = get_current_tty(); if (tty) { spin_lock(&tty->files_lock); if (!list_empty(&tty->tty_files)) { struct tty_file_private *file_priv; /* Revalidate access to controlling tty. Use file_path_has_perm on the tty path directly rather than using file_has_perm, as this particular open file may belong to another process and we are only interested in the inode-based check here. */ file_priv = list_first_entry(&tty->tty_files, struct tty_file_private, list); file = file_priv->file; if (file_path_has_perm(cred, file, FILE__READ | FILE__WRITE)) drop_tty = 1; } spin_unlock(&tty->files_lock); tty_kref_put(tty); } /* Reset controlling tty. */ if (drop_tty) no_tty(); /* Revalidate access to inherited open files. */ n = iterate_fd(files, 0, match_file, cred); if (!n) /* none found? */ return; devnull = dentry_open(&selinux_null, O_RDWR, cred); if (IS_ERR(devnull)) devnull = NULL; /* replace all the matching ones with this */ do { replace_fd(n - 1, devnull, 0); } while ((n = iterate_fd(files, n, match_file, cred)) != 0); if (devnull) fput(devnull); } /* * Prepare a process for imminent new credential changes due to exec */ static void selinux_bprm_committing_creds(struct linux_binprm *bprm) { struct task_security_struct *new_tsec; struct rlimit *rlim, *initrlim; int rc, i; new_tsec = selinux_cred(bprm->cred); if (new_tsec->sid == new_tsec->osid) return; /* Close files for which the new task SID is not authorized. */ flush_unauthorized_files(bprm->cred, current->files); /* Always clear parent death signal on SID transitions. */ current->pdeath_signal = 0; /* Check whether the new SID can inherit resource limits from the old * SID. If not, reset all soft limits to the lower of the current * task's hard limit and the init task's soft limit. * * Note that the setting of hard limits (even to lower them) can be * controlled by the setrlimit check. The inclusion of the init task's * soft limit into the computation is to avoid resetting soft limits * higher than the default soft limit for cases where the default is * lower than the hard limit, e.g. RLIMIT_CORE or RLIMIT_STACK. */ rc = avc_has_perm(&selinux_state, new_tsec->osid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__RLIMITINH, NULL); if (rc) { /* protect against do_prlimit() */ task_lock(current); for (i = 0; i < RLIM_NLIMITS; i++) { rlim = current->signal->rlim + i; initrlim = init_task.signal->rlim + i; rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur); } task_unlock(current); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) update_rlimit_cpu(current, rlimit(RLIMIT_CPU)); } } /* * Clean up the process immediately after the installation of new credentials * due to exec */ static void selinux_bprm_committed_creds(struct linux_binprm *bprm) { const struct task_security_struct *tsec = selinux_cred(current_cred()); u32 osid, sid; int rc; osid = tsec->osid; sid = tsec->sid; if (sid == osid) return; /* Check whether the new SID can inherit signal state from the old SID. * If not, clear itimers to avoid subsequent signal generation and * flush and unblock signals. * * This must occur _after_ the task SID has been updated so that any * kill done after the flush will be checked against the new SID. */ rc = avc_has_perm(&selinux_state, osid, sid, SECCLASS_PROCESS, PROCESS__SIGINH, NULL); if (rc) { clear_itimer(); spin_lock_irq(&current->sighand->siglock); if (!fatal_signal_pending(current)) { flush_sigqueue(&current->pending); flush_sigqueue(&current->signal->shared_pending); flush_signal_handlers(current, 1); sigemptyset(&current->blocked); recalc_sigpending(); } spin_unlock_irq(&current->sighand->siglock); } /* Wake up the parent if it is waiting so that it can recheck * wait permission to the new task SID. */ read_lock(&tasklist_lock); __wake_up_parent(current, current->real_parent); read_unlock(&tasklist_lock); } /* superblock security operations */ static int selinux_sb_alloc_security(struct super_block *sb) { struct superblock_security_struct *sbsec; sbsec = kzalloc(sizeof(struct superblock_security_struct), GFP_KERNEL); if (!sbsec) return -ENOMEM; mutex_init(&sbsec->lock); INIT_LIST_HEAD(&sbsec->isec_head); spin_lock_init(&sbsec->isec_lock); sbsec->sb = sb; sbsec->sid = SECINITSID_UNLABELED; sbsec->def_sid = SECINITSID_FILE; sbsec->mntpoint_sid = SECINITSID_UNLABELED; sb->s_security = sbsec; return 0; } static void selinux_sb_free_security(struct super_block *sb) { superblock_free_security(sb); } static inline int opt_len(const char *s) { bool open_quote = false; int len; char c; for (len = 0; (c = s[len]) != '\0'; len++) { if (c == '"') open_quote = !open_quote; if (c == ',' && !open_quote) break; } return len; } static int selinux_sb_eat_lsm_opts(char *options, void **mnt_opts) { char *from = options; char *to = options; bool first = true; int rc; while (1) { int len = opt_len(from); int token; char *arg = NULL; token = match_opt_prefix(from, len, &arg); if (token != Opt_error) { char *p, *q; /* strip quotes */ if (arg) { for (p = q = arg; p < from + len; p++) { char c = *p; if (c != '"') *q++ = c; } arg = kmemdup_nul(arg, q - arg, GFP_KERNEL); if (!arg) { rc = -ENOMEM; goto free_opt; } } rc = selinux_add_opt(token, arg, mnt_opts); if (unlikely(rc)) { kfree(arg); goto free_opt; } } else { if (!first) { // copy with preceding comma from--; len++; } if (to != from) memmove(to, from, len); to += len; first = false; } if (!from[len]) break; from += len + 1; } *to = '\0'; return 0; free_opt: if (*mnt_opts) { selinux_free_mnt_opts(*mnt_opts); *mnt_opts = NULL; } return rc; } static int selinux_sb_remount(struct super_block *sb, void *mnt_opts) { struct selinux_mnt_opts *opts = mnt_opts; struct superblock_security_struct *sbsec = sb->s_security; u32 sid; int rc; if (!(sbsec->flags & SE_SBINITIALIZED)) return 0; if (!opts) return 0; if (opts->fscontext) { rc = parse_sid(sb, opts->fscontext, &sid); if (rc) return rc; if (bad_option(sbsec, FSCONTEXT_MNT, sbsec->sid, sid)) goto out_bad_option; } if (opts->context) { rc = parse_sid(sb, opts->context, &sid); if (rc) return rc; if (bad_option(sbsec, CONTEXT_MNT, sbsec->mntpoint_sid, sid)) goto out_bad_option; } if (opts->rootcontext) { struct inode_security_struct *root_isec; root_isec = backing_inode_security(sb->s_root); rc = parse_sid(sb, opts->rootcontext, &sid); if (rc) return rc; if (bad_option(sbsec, ROOTCONTEXT_MNT, root_isec->sid, sid)) goto out_bad_option; } if (opts->defcontext) { rc = parse_sid(sb, opts->defcontext, &sid); if (rc) return rc; if (bad_option(sbsec, DEFCONTEXT_MNT, sbsec->def_sid, sid)) goto out_bad_option; } return 0; out_bad_option: pr_warn("SELinux: unable to change security options " "during remount (dev %s, type=%s)\n", sb->s_id, sb->s_type->name); return -EINVAL; } static int selinux_sb_kern_mount(struct super_block *sb) { const struct cred *cred = current_cred(); struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = sb->s_root; return superblock_has_perm(cred, sb, FILESYSTEM__MOUNT, &ad); } static int selinux_sb_statfs(struct dentry *dentry) { const struct cred *cred = current_cred(); struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry->d_sb->s_root; return superblock_has_perm(cred, dentry->d_sb, FILESYSTEM__GETATTR, &ad); } static int selinux_mount(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data) { const struct cred *cred = current_cred(); if (flags & MS_REMOUNT) return superblock_has_perm(cred, path->dentry->d_sb, FILESYSTEM__REMOUNT, NULL); else return path_has_perm(cred, path, FILE__MOUNTON); } static int selinux_move_mount(const struct path *from_path, const struct path *to_path) { const struct cred *cred = current_cred(); return path_has_perm(cred, to_path, FILE__MOUNTON); } static int selinux_umount(struct vfsmount *mnt, int flags) { const struct cred *cred = current_cred(); return superblock_has_perm(cred, mnt->mnt_sb, FILESYSTEM__UNMOUNT, NULL); } static int selinux_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) { const struct selinux_mnt_opts *src = src_fc->security; struct selinux_mnt_opts *opts; if (!src) return 0; fc->security = kzalloc(sizeof(struct selinux_mnt_opts), GFP_KERNEL); if (!fc->security) return -ENOMEM; opts = fc->security; if (src->fscontext) { opts->fscontext = kstrdup(src->fscontext, GFP_KERNEL); if (!opts->fscontext) return -ENOMEM; } if (src->context) { opts->context = kstrdup(src->context, GFP_KERNEL); if (!opts->context) return -ENOMEM; } if (src->rootcontext) { opts->rootcontext = kstrdup(src->rootcontext, GFP_KERNEL); if (!opts->rootcontext) return -ENOMEM; } if (src->defcontext) { opts->defcontext = kstrdup(src->defcontext, GFP_KERNEL); if (!opts->defcontext) return -ENOMEM; } return 0; } static const struct fs_parameter_spec selinux_fs_parameters[] = { fsparam_string(CONTEXT_STR, Opt_context), fsparam_string(DEFCONTEXT_STR, Opt_defcontext), fsparam_string(FSCONTEXT_STR, Opt_fscontext), fsparam_string(ROOTCONTEXT_STR, Opt_rootcontext), fsparam_flag (SECLABEL_STR, Opt_seclabel), {} }; static int selinux_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct fs_parse_result result; int opt, rc; opt = fs_parse(fc, selinux_fs_parameters, param, &result); if (opt < 0) return opt; rc = selinux_add_opt(opt, param->string, &fc->security); if (!rc) { param->string = NULL; rc = 1; } return rc; } /* inode security operations */ static int selinux_inode_alloc_security(struct inode *inode) { struct inode_security_struct *isec = selinux_inode(inode); u32 sid = current_sid(); spin_lock_init(&isec->lock); INIT_LIST_HEAD(&isec->list); isec->inode = inode; isec->sid = SECINITSID_UNLABELED; isec->sclass = SECCLASS_FILE; isec->task_sid = sid; isec->initialized = LABEL_INVALID; return 0; } static void selinux_inode_free_security(struct inode *inode) { inode_free_security(inode); } static int selinux_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, void **ctx, u32 *ctxlen) { u32 newsid; int rc; rc = selinux_determine_inode_label(selinux_cred(current_cred()), d_inode(dentry->d_parent), name, inode_mode_to_security_class(mode), &newsid); if (rc) return rc; return security_sid_to_context(&selinux_state, newsid, (char **)ctx, ctxlen); } static int selinux_dentry_create_files_as(struct dentry *dentry, int mode, struct qstr *name, const struct cred *old, struct cred *new) { u32 newsid; int rc; struct task_security_struct *tsec; rc = selinux_determine_inode_label(selinux_cred(old), d_inode(dentry->d_parent), name, inode_mode_to_security_class(mode), &newsid); if (rc) return rc; tsec = selinux_cred(new); tsec->create_sid = newsid; return 0; } static int selinux_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, const char **name, void **value, size_t *len) { const struct task_security_struct *tsec = selinux_cred(current_cred()); struct superblock_security_struct *sbsec; u32 newsid, clen; int rc; char *context; sbsec = dir->i_sb->s_security; newsid = tsec->create_sid; rc = selinux_determine_inode_label(tsec, dir, qstr, inode_mode_to_security_class(inode->i_mode), &newsid); if (rc) return rc; /* Possibly defer initialization to selinux_complete_init. */ if (sbsec->flags & SE_SBINITIALIZED) { struct inode_security_struct *isec = selinux_inode(inode); isec->sclass = inode_mode_to_security_class(inode->i_mode); isec->sid = newsid; isec->initialized = LABEL_INITIALIZED; } if (!selinux_initialized(&selinux_state) || !(sbsec->flags & SBLABEL_MNT)) return -EOPNOTSUPP; if (name) *name = XATTR_SELINUX_SUFFIX; if (value && len) { rc = security_sid_to_context_force(&selinux_state, newsid, &context, &clen); if (rc) return rc; *value = context; *len = clen; } return 0; } static int selinux_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode) { return may_create(dir, dentry, SECCLASS_FILE); } static int selinux_inode_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { return may_link(dir, old_dentry, MAY_LINK); } static int selinux_inode_unlink(struct inode *dir, struct dentry *dentry) { return may_link(dir, dentry, MAY_UNLINK); } static int selinux_inode_symlink(struct inode *dir, struct dentry *dentry, const char *name) { return may_create(dir, dentry, SECCLASS_LNK_FILE); } static int selinux_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mask) { return may_create(dir, dentry, SECCLASS_DIR); } static int selinux_inode_rmdir(struct inode *dir, struct dentry *dentry) { return may_link(dir, dentry, MAY_RMDIR); } static int selinux_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { return may_create(dir, dentry, inode_mode_to_security_class(mode)); } static int selinux_inode_rename(struct inode *old_inode, struct dentry *old_dentry, struct inode *new_inode, struct dentry *new_dentry) { return may_rename(old_inode, old_dentry, new_inode, new_dentry); } static int selinux_inode_readlink(struct dentry *dentry) { const struct cred *cred = current_cred(); return dentry_has_perm(cred, dentry, FILE__READ); } static int selinux_inode_follow_link(struct dentry *dentry, struct inode *inode, bool rcu) { const struct cred *cred = current_cred(); struct common_audit_data ad; struct inode_security_struct *isec; u32 sid; validate_creds(cred); ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; sid = cred_sid(cred); isec = inode_security_rcu(inode, rcu); if (IS_ERR(isec)) return PTR_ERR(isec); return avc_has_perm_flags(&selinux_state, sid, isec->sid, isec->sclass, FILE__READ, &ad, rcu ? MAY_NOT_BLOCK : 0); } static noinline int audit_inode_permission(struct inode *inode, u32 perms, u32 audited, u32 denied, int result) { struct common_audit_data ad; struct inode_security_struct *isec = selinux_inode(inode); int rc; ad.type = LSM_AUDIT_DATA_INODE; ad.u.inode = inode; rc = slow_avc_audit(&selinux_state, current_sid(), isec->sid, isec->sclass, perms, audited, denied, result, &ad); if (rc) return rc; return 0; } static int selinux_inode_permission(struct inode *inode, int mask) { const struct cred *cred = current_cred(); u32 perms; bool from_access; bool no_block = mask & MAY_NOT_BLOCK; struct inode_security_struct *isec; u32 sid; struct av_decision avd; int rc, rc2; u32 audited, denied; from_access = mask & MAY_ACCESS; mask &= (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND); /* No permission to check. Existence test. */ if (!mask) return 0; validate_creds(cred); if (unlikely(IS_PRIVATE(inode))) return 0; perms = file_mask_to_av(inode->i_mode, mask); sid = cred_sid(cred); isec = inode_security_rcu(inode, no_block); if (IS_ERR(isec)) return PTR_ERR(isec); rc = avc_has_perm_noaudit(&selinux_state, sid, isec->sid, isec->sclass, perms, no_block ? AVC_NONBLOCKING : 0, &avd); audited = avc_audit_required(perms, &avd, rc, from_access ? FILE__AUDIT_ACCESS : 0, &denied); if (likely(!audited)) return rc; /* fall back to ref-walk if we have to generate audit */ if (no_block) return -ECHILD; rc2 = audit_inode_permission(inode, perms, audited, denied, rc); if (rc2) return rc2; return rc; } static int selinux_inode_setattr(struct dentry *dentry, struct iattr *iattr) { const struct cred *cred = current_cred(); struct inode *inode = d_backing_inode(dentry); unsigned int ia_valid = iattr->ia_valid; __u32 av = FILE__WRITE; /* ATTR_FORCE is just used for ATTR_KILL_S[UG]ID. */ if (ia_valid & ATTR_FORCE) { ia_valid &= ~(ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_MODE | ATTR_FORCE); if (!ia_valid) return 0; } if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_TIMES_SET)) return dentry_has_perm(cred, dentry, FILE__SETATTR); if (selinux_policycap_openperm() && inode->i_sb->s_magic != SOCKFS_MAGIC && (ia_valid & ATTR_SIZE) && !(ia_valid & ATTR_FILE)) av |= FILE__OPEN; return dentry_has_perm(cred, dentry, av); } static int selinux_inode_getattr(const struct path *path) { return path_has_perm(current_cred(), path, FILE__GETATTR); } static bool has_cap_mac_admin(bool audit) { const struct cred *cred = current_cred(); unsigned int opts = audit ? CAP_OPT_NONE : CAP_OPT_NOAUDIT; if (cap_capable(cred, &init_user_ns, CAP_MAC_ADMIN, opts)) return false; if (cred_has_capability(cred, CAP_MAC_ADMIN, opts, true)) return false; return true; } static int selinux_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = d_backing_inode(dentry); struct inode_security_struct *isec; struct superblock_security_struct *sbsec; struct common_audit_data ad; u32 newsid, sid = current_sid(); int rc = 0; if (strcmp(name, XATTR_NAME_SELINUX)) { rc = cap_inode_setxattr(dentry, name, value, size, flags); if (rc) return rc; /* Not an attribute we recognize, so just check the ordinary setattr permission. */ return dentry_has_perm(current_cred(), dentry, FILE__SETATTR); } if (!selinux_initialized(&selinux_state)) return (inode_owner_or_capable(inode) ? 0 : -EPERM); sbsec = inode->i_sb->s_security; if (!(sbsec->flags & SBLABEL_MNT)) return -EOPNOTSUPP; if (!inode_owner_or_capable(inode)) return -EPERM; ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; isec = backing_inode_security(dentry); rc = avc_has_perm(&selinux_state, sid, isec->sid, isec->sclass, FILE__RELABELFROM, &ad); if (rc) return rc; rc = security_context_to_sid(&selinux_state, value, size, &newsid, GFP_KERNEL); if (rc == -EINVAL) { if (!has_cap_mac_admin(true)) { struct audit_buffer *ab; size_t audit_size; /* We strip a nul only if it is at the end, otherwise the * context contains a nul and we should audit that */ if (value) { const char *str = value; if (str[size - 1] == '\0') audit_size = size - 1; else audit_size = size; } else { audit_size = 0; } ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR); audit_log_format(ab, "op=setxattr invalid_context="); audit_log_n_untrustedstring(ab, value, audit_size); audit_log_end(ab); return rc; } rc = security_context_to_sid_force(&selinux_state, value, size, &newsid); } if (rc) return rc; rc = avc_has_perm(&selinux_state, sid, newsid, isec->sclass, FILE__RELABELTO, &ad); if (rc) return rc; rc = security_validate_transition(&selinux_state, isec->sid, newsid, sid, isec->sclass); if (rc) return rc; return avc_has_perm(&selinux_state, newsid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__ASSOCIATE, &ad); } static void selinux_inode_post_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = d_backing_inode(dentry); struct inode_security_struct *isec; u32 newsid; int rc; if (strcmp(name, XATTR_NAME_SELINUX)) { /* Not an attribute we recognize, so nothing to do. */ return; } if (!selinux_initialized(&selinux_state)) { /* If we haven't even been initialized, then we can't validate * against a policy, so leave the label as invalid. It may * resolve to a valid label on the next revalidation try if * we've since initialized. */ return; } rc = security_context_to_sid_force(&selinux_state, value, size, &newsid); if (rc) { pr_err("SELinux: unable to map context to SID" "for (%s, %lu), rc=%d\n", inode->i_sb->s_id, inode->i_ino, -rc); return; } isec = backing_inode_security(dentry); spin_lock(&isec->lock); isec->sclass = inode_mode_to_security_class(inode->i_mode); isec->sid = newsid; isec->initialized = LABEL_INITIALIZED; spin_unlock(&isec->lock); return; } static int selinux_inode_getxattr(struct dentry *dentry, const char *name) { const struct cred *cred = current_cred(); return dentry_has_perm(cred, dentry, FILE__GETATTR); } static int selinux_inode_listxattr(struct dentry *dentry) { const struct cred *cred = current_cred(); return dentry_has_perm(cred, dentry, FILE__GETATTR); } static int selinux_inode_removexattr(struct dentry *dentry, const char *name) { if (strcmp(name, XATTR_NAME_SELINUX)) { int rc = cap_inode_removexattr(dentry, name); if (rc) return rc; /* Not an attribute we recognize, so just check the ordinary setattr permission. */ return dentry_has_perm(current_cred(), dentry, FILE__SETATTR); } if (!selinux_initialized(&selinux_state)) return 0; /* No one is allowed to remove a SELinux security label. You can change the label, but all data must be labeled. */ return -EACCES; } static int selinux_path_notify(const struct path *path, u64 mask, unsigned int obj_type) { int ret; u32 perm; struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_PATH; ad.u.path = *path; /* * Set permission needed based on the type of mark being set. * Performs an additional check for sb watches. */ switch (obj_type) { case FSNOTIFY_OBJ_TYPE_VFSMOUNT: perm = FILE__WATCH_MOUNT; break; case FSNOTIFY_OBJ_TYPE_SB: perm = FILE__WATCH_SB; ret = superblock_has_perm(current_cred(), path->dentry->d_sb, FILESYSTEM__WATCH, &ad); if (ret) return ret; break; case FSNOTIFY_OBJ_TYPE_INODE: perm = FILE__WATCH; break; default: return -EINVAL; } /* blocking watches require the file:watch_with_perm permission */ if (mask & (ALL_FSNOTIFY_PERM_EVENTS)) perm |= FILE__WATCH_WITH_PERM; /* watches on read-like events need the file:watch_reads permission */ if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_CLOSE_NOWRITE)) perm |= FILE__WATCH_READS; return path_has_perm(current_cred(), path, perm); } /* * Copy the inode security context value to the user. * * Permission check is handled by selinux_inode_getxattr hook. */ static int selinux_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc) { u32 size; int error; char *context = NULL; struct inode_security_struct *isec; /* * If we're not initialized yet, then we can't validate contexts, so * just let vfs_getxattr fall back to using the on-disk xattr. */ if (!selinux_initialized(&selinux_state) || strcmp(name, XATTR_SELINUX_SUFFIX)) return -EOPNOTSUPP; /* * If the caller has CAP_MAC_ADMIN, then get the raw context * value even if it is not defined by current policy; otherwise, * use the in-core value under current policy. * Use the non-auditing forms of the permission checks since * getxattr may be called by unprivileged processes commonly * and lack of permission just means that we fall back to the * in-core context value, not a denial. */ isec = inode_security(inode); if (has_cap_mac_admin(false)) error = security_sid_to_context_force(&selinux_state, isec->sid, &context, &size); else error = security_sid_to_context(&selinux_state, isec->sid, &context, &size); if (error) return error; error = size; if (alloc) { *buffer = context; goto out_nofree; } kfree(context); out_nofree: return error; } static int selinux_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags) { struct inode_security_struct *isec = inode_security_novalidate(inode); struct superblock_security_struct *sbsec = inode->i_sb->s_security; u32 newsid; int rc; if (strcmp(name, XATTR_SELINUX_SUFFIX)) return -EOPNOTSUPP; if (!(sbsec->flags & SBLABEL_MNT)) return -EOPNOTSUPP; if (!value || !size) return -EACCES; rc = security_context_to_sid(&selinux_state, value, size, &newsid, GFP_KERNEL); if (rc) return rc; spin_lock(&isec->lock); isec->sclass = inode_mode_to_security_class(inode->i_mode); isec->sid = newsid; isec->initialized = LABEL_INITIALIZED; spin_unlock(&isec->lock); return 0; } static int selinux_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size) { const int len = sizeof(XATTR_NAME_SELINUX); if (!selinux_initialized(&selinux_state)) return 0; if (buffer && len <= buffer_size) memcpy(buffer, XATTR_NAME_SELINUX, len); return len; } static void selinux_inode_getsecid(struct inode *inode, u32 *secid) { struct inode_security_struct *isec = inode_security_novalidate(inode); *secid = isec->sid; } static int selinux_inode_copy_up(struct dentry *src, struct cred **new) { u32 sid; struct task_security_struct *tsec; struct cred *new_creds = *new; if (new_creds == NULL) { new_creds = prepare_creds(); if (!new_creds) return -ENOMEM; } tsec = selinux_cred(new_creds); /* Get label from overlay inode and set it in create_sid */ selinux_inode_getsecid(d_inode(src), &sid); tsec->create_sid = sid; *new = new_creds; return 0; } static int selinux_inode_copy_up_xattr(const char *name) { /* The copy_up hook above sets the initial context on an inode, but we * don't then want to overwrite it by blindly copying all the lower * xattrs up. Instead, we have to filter out SELinux-related xattrs. */ if (strcmp(name, XATTR_NAME_SELINUX) == 0) return 1; /* Discard */ /* * Any other attribute apart from SELINUX is not claimed, supported * by selinux. */ return -EOPNOTSUPP; } /* kernfs node operations */ static int selinux_kernfs_init_security(struct kernfs_node *kn_dir, struct kernfs_node *kn) { const struct task_security_struct *tsec = selinux_cred(current_cred()); u32 parent_sid, newsid, clen; int rc; char *context; rc = kernfs_xattr_get(kn_dir, XATTR_NAME_SELINUX, NULL, 0); if (rc == -ENODATA) return 0; else if (rc < 0) return rc; clen = (u32)rc; context = kmalloc(clen, GFP_KERNEL); if (!context) return -ENOMEM; rc = kernfs_xattr_get(kn_dir, XATTR_NAME_SELINUX, context, clen); if (rc < 0) { kfree(context); return rc; } rc = security_context_to_sid(&selinux_state, context, clen, &parent_sid, GFP_KERNEL); kfree(context); if (rc) return rc; if (tsec->create_sid) { newsid = tsec->create_sid; } else { u16 secclass = inode_mode_to_security_class(kn->mode); struct qstr q; q.name = kn->name; q.hash_len = hashlen_string(kn_dir, kn->name); rc = security_transition_sid(&selinux_state, tsec->sid, parent_sid, secclass, &q, &newsid); if (rc) return rc; } rc = security_sid_to_context_force(&selinux_state, newsid, &context, &clen); if (rc) return rc; rc = kernfs_xattr_set(kn, XATTR_NAME_SELINUX, context, clen, XATTR_CREATE); kfree(context); return rc; } /* file security operations */ static int selinux_revalidate_file_permission(struct file *file, int mask) { const struct cred *cred = current_cred(); struct inode *inode = file_inode(file); /* file_mask_to_av won't add FILE__WRITE if MAY_APPEND is set */ if ((file->f_flags & O_APPEND) && (mask & MAY_WRITE)) mask |= MAY_APPEND; return file_has_perm(cred, file, file_mask_to_av(inode->i_mode, mask)); } static int selinux_file_permission(struct file *file, int mask) { struct inode *inode = file_inode(file); struct file_security_struct *fsec = selinux_file(file); struct inode_security_struct *isec; u32 sid = current_sid(); if (!mask) /* No permission to check. Existence test. */ return 0; isec = inode_security(inode); if (sid == fsec->sid && fsec->isid == isec->sid && fsec->pseqno == avc_policy_seqno(&selinux_state)) /* No change since file_open check. */ return 0; return selinux_revalidate_file_permission(file, mask); } static int selinux_file_alloc_security(struct file *file) { struct file_security_struct *fsec = selinux_file(file); u32 sid = current_sid(); fsec->sid = sid; fsec->fown_sid = sid; return 0; } /* * Check whether a task has the ioctl permission and cmd * operation to an inode. */ static int ioctl_has_perm(const struct cred *cred, struct file *file, u32 requested, u16 cmd) { struct common_audit_data ad; struct file_security_struct *fsec = selinux_file(file); struct inode *inode = file_inode(file); struct inode_security_struct *isec; struct lsm_ioctlop_audit ioctl; u32 ssid = cred_sid(cred); int rc; u8 driver = cmd >> 8; u8 xperm = cmd & 0xff; ad.type = LSM_AUDIT_DATA_IOCTL_OP; ad.u.op = &ioctl; ad.u.op->cmd = cmd; ad.u.op->path = file->f_path; if (ssid != fsec->sid) { rc = avc_has_perm(&selinux_state, ssid, fsec->sid, SECCLASS_FD, FD__USE, &ad); if (rc) goto out; } if (unlikely(IS_PRIVATE(inode))) return 0; isec = inode_security(inode); rc = avc_has_extended_perms(&selinux_state, ssid, isec->sid, isec->sclass, requested, driver, xperm, &ad); out: return rc; } static int selinux_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { const struct cred *cred = current_cred(); int error = 0; switch (cmd) { case FIONREAD: case FIBMAP: case FIGETBSZ: case FS_IOC_GETFLAGS: case FS_IOC_GETVERSION: error = file_has_perm(cred, file, FILE__GETATTR); break; case FS_IOC_SETFLAGS: case FS_IOC_SETVERSION: error = file_has_perm(cred, file, FILE__SETATTR); break; /* sys_ioctl() checks */ case FIONBIO: case FIOASYNC: error = file_has_perm(cred, file, 0); break; case KDSKBENT: case KDSKBSENT: error = cred_has_capability(cred, CAP_SYS_TTY_CONFIG, CAP_OPT_NONE, true); break; /* default case assumes that the command will go * to the file's ioctl() function. */ default: error = ioctl_has_perm(cred, file, FILE__IOCTL, (u16) cmd); } return error; } static int default_noexec __ro_after_init; static int file_map_prot_check(struct file *file, unsigned long prot, int shared) { const struct cred *cred = current_cred(); u32 sid = cred_sid(cred); int rc = 0; if (default_noexec && (prot & PROT_EXEC) && (!file || IS_PRIVATE(file_inode(file)) || (!shared && (prot & PROT_WRITE)))) { /* * We are making executable an anonymous mapping or a * private file mapping that will also be writable. * This has an additional check. */ rc = avc_has_perm(&selinux_state, sid, sid, SECCLASS_PROCESS, PROCESS__EXECMEM, NULL); if (rc) goto error; } if (file) { /* read access is always possible with a mapping */ u32 av = FILE__READ; /* write access only matters if the mapping is shared */ if (shared && (prot & PROT_WRITE)) av |= FILE__WRITE; if (prot & PROT_EXEC) av |= FILE__EXECUTE; return file_has_perm(cred, file, av); } error: return rc; } static int selinux_mmap_addr(unsigned long addr) { int rc = 0; if (addr < CONFIG_LSM_MMAP_MIN_ADDR) { u32 sid = current_sid(); rc = avc_has_perm(&selinux_state, sid, sid, SECCLASS_MEMPROTECT, MEMPROTECT__MMAP_ZERO, NULL); } return rc; } static int selinux_mmap_file(struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags) { struct common_audit_data ad; int rc; if (file) { ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = file; rc = inode_has_perm(current_cred(), file_inode(file), FILE__MAP, &ad); if (rc) return rc; } if (checkreqprot_get(&selinux_state)) prot = reqprot; return file_map_prot_check(file, prot, (flags & MAP_TYPE) == MAP_SHARED); } static int selinux_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) { const struct cred *cred = current_cred(); u32 sid = cred_sid(cred); if (checkreqprot_get(&selinux_state)) prot = reqprot; if (default_noexec && (prot & PROT_EXEC) && !(vma->vm_flags & VM_EXEC)) { int rc = 0; if (vma->vm_start >= vma->vm_mm->start_brk && vma->vm_end <= vma->vm_mm->brk) { rc = avc_has_perm(&selinux_state, sid, sid, SECCLASS_PROCESS, PROCESS__EXECHEAP, NULL); } else if (!vma->vm_file && ((vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack) || vma_is_stack_for_current(vma))) { rc = avc_has_perm(&selinux_state, sid, sid, SECCLASS_PROCESS, PROCESS__EXECSTACK, NULL); } else if (vma->vm_file && vma->anon_vma) { /* * We are making executable a file mapping that has * had some COW done. Since pages might have been * written, check ability to execute the possibly * modified content. This typically should only * occur for text relocations. */ rc = file_has_perm(cred, vma->vm_file, FILE__EXECMOD); } if (rc) return rc; } return file_map_prot_check(vma->vm_file, prot, vma->vm_flags&VM_SHARED); } static int selinux_file_lock(struct file *file, unsigned int cmd) { const struct cred *cred = current_cred(); return file_has_perm(cred, file, FILE__LOCK); } static int selinux_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { const struct cred *cred = current_cred(); int err = 0; switch (cmd) { case F_SETFL: if ((file->f_flags & O_APPEND) && !(arg & O_APPEND)) { err = file_has_perm(cred, file, FILE__WRITE); break; } fallthrough; case F_SETOWN: case F_SETSIG: case F_GETFL: case F_GETOWN: case F_GETSIG: case F_GETOWNER_UIDS: /* Just check FD__USE permission */ err = file_has_perm(cred, file, 0); break; case F_GETLK: case F_SETLK: case F_SETLKW: case F_OFD_GETLK: case F_OFD_SETLK: case F_OFD_SETLKW: #if BITS_PER_LONG == 32 case F_GETLK64: case F_SETLK64: case F_SETLKW64: #endif err = file_has_perm(cred, file, FILE__LOCK); break; } return err; } static void selinux_file_set_fowner(struct file *file) { struct file_security_struct *fsec; fsec = selinux_file(file); fsec->fown_sid = current_sid(); } static int selinux_file_send_sigiotask(struct task_struct *tsk, struct fown_struct *fown, int signum) { struct file *file; u32 sid = task_sid(tsk); u32 perm; struct file_security_struct *fsec; /* struct fown_struct is never outside the context of a struct file */ file = container_of(fown, struct file, f_owner); fsec = selinux_file(file); if (!signum) perm = signal_to_av(SIGIO); /* as per send_sigio_to_task */ else perm = signal_to_av(signum); return avc_has_perm(&selinux_state, fsec->fown_sid, sid, SECCLASS_PROCESS, perm, NULL); } static int selinux_file_receive(struct file *file) { const struct cred *cred = current_cred(); return file_has_perm(cred, file, file_to_av(file)); } static int selinux_file_open(struct file *file) { struct file_security_struct *fsec; struct inode_security_struct *isec; fsec = selinux_file(file); isec = inode_security(file_inode(file)); /* * Save inode label and policy sequence number * at open-time so that selinux_file_permission * can determine whether revalidation is necessary. * Task label is already saved in the file security * struct as its SID. */ fsec->isid = isec->sid; fsec->pseqno = avc_policy_seqno(&selinux_state); /* * Since the inode label or policy seqno may have changed * between the selinux_inode_permission check and the saving * of state above, recheck that access is still permitted. * Otherwise, access might never be revalidated against the * new inode label or new policy. * This check is not redundant - do not remove. */ return file_path_has_perm(file->f_cred, file, open_file_to_av(file)); } /* task security operations */ static int selinux_task_alloc(struct task_struct *task, unsigned long clone_flags) { u32 sid = current_sid(); return avc_has_perm(&selinux_state, sid, sid, SECCLASS_PROCESS, PROCESS__FORK, NULL); } /* * prepare a new set of credentials for modification */ static int selinux_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) { const struct task_security_struct *old_tsec = selinux_cred(old); struct task_security_struct *tsec = selinux_cred(new); *tsec = *old_tsec; return 0; } /* * transfer the SELinux data to a blank set of creds */ static void selinux_cred_transfer(struct cred *new, const struct cred *old) { const struct task_security_struct *old_tsec = selinux_cred(old); struct task_security_struct *tsec = selinux_cred(new); *tsec = *old_tsec; } static void selinux_cred_getsecid(const struct cred *c, u32 *secid) { *secid = cred_sid(c); } /* * set the security data for a kernel service * - all the creation contexts are set to unlabelled */ static int selinux_kernel_act_as(struct cred *new, u32 secid) { struct task_security_struct *tsec = selinux_cred(new); u32 sid = current_sid(); int ret; ret = avc_has_perm(&selinux_state, sid, secid, SECCLASS_KERNEL_SERVICE, KERNEL_SERVICE__USE_AS_OVERRIDE, NULL); if (ret == 0) { tsec->sid = secid; tsec->create_sid = 0; tsec->keycreate_sid = 0; tsec->sockcreate_sid = 0; } return ret; } /* * set the file creation context in a security record to the same as the * objective context of the specified inode */ static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode) { struct inode_security_struct *isec = inode_security(inode); struct task_security_struct *tsec = selinux_cred(new); u32 sid = current_sid(); int ret; ret = avc_has_perm(&selinux_state, sid, isec->sid, SECCLASS_KERNEL_SERVICE, KERNEL_SERVICE__CREATE_FILES_AS, NULL); if (ret == 0) tsec->create_sid = isec->sid; return ret; } static int selinux_kernel_module_request(char *kmod_name) { struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_KMOD; ad.u.kmod_name = kmod_name; return avc_has_perm(&selinux_state, current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__MODULE_REQUEST, &ad); } static int selinux_kernel_module_from_file(struct file *file) { struct common_audit_data ad; struct inode_security_struct *isec; struct file_security_struct *fsec; u32 sid = current_sid(); int rc; /* init_module */ if (file == NULL) return avc_has_perm(&selinux_state, sid, sid, SECCLASS_SYSTEM, SYSTEM__MODULE_LOAD, NULL); /* finit_module */ ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = file; fsec = selinux_file(file); if (sid != fsec->sid) { rc = avc_has_perm(&selinux_state, sid, fsec->sid, SECCLASS_FD, FD__USE, &ad); if (rc) return rc; } isec = inode_security(file_inode(file)); return avc_has_perm(&selinux_state, sid, isec->sid, SECCLASS_SYSTEM, SYSTEM__MODULE_LOAD, &ad); } static int selinux_kernel_read_file(struct file *file, enum kernel_read_file_id id, bool contents) { int rc = 0; switch (id) { case READING_MODULE: rc = selinux_kernel_module_from_file(contents ? file : NULL); break; default: break; } return rc; } static int selinux_kernel_load_data(enum kernel_load_data_id id, bool contents) { int rc = 0; switch (id) { case LOADING_MODULE: rc = selinux_kernel_module_from_file(NULL); default: break; } return rc; } static int selinux_task_setpgid(struct task_struct *p, pid_t pgid) { return avc_has_perm(&selinux_state, current_sid(), task_sid(p), SECCLASS_PROCESS, PROCESS__SETPGID, NULL); } static int selinux_task_getpgid(struct task_struct *p) { return avc_has_perm(&selinux_state, current_sid(), task_sid(p), SECCLASS_PROCESS, PROCESS__GETPGID, NULL); } static int selinux_task_getsid(struct task_struct *p) { return avc_has_perm(&selinux_state, current_sid(), task_sid(p), SECCLASS_PROCESS, PROCESS__GETSESSION, NULL); } static void selinux_task_getsecid(struct task_struct *p, u32 *secid) { *secid = task_sid(p); } static int selinux_task_setnice(struct task_struct *p, int nice) { return avc_has_perm(&selinux_state, current_sid(), task_sid(p), SECCLASS_PROCESS, PROCESS__SETSCHED, NULL); } static int selinux_task_setioprio(struct task_struct *p, int ioprio) { return avc_has_perm(&selinux_state, current_sid(), task_sid(p), SECCLASS_PROCESS, PROCESS__SETSCHED, NULL); } static int selinux_task_getioprio(struct task_struct *p) { return avc_has_perm(&selinux_state, current_sid(), task_sid(p), SECCLASS_PROCESS, PROCESS__GETSCHED, NULL); } static int selinux_task_prlimit(const struct cred *cred, const struct cred *tcred, unsigned int flags) { u32 av = 0; if (!flags) return 0; if (flags & LSM_PRLIMIT_WRITE) av |= PROCESS__SETRLIMIT; if (flags & LSM_PRLIMIT_READ) av |= PROCESS__GETRLIMIT; return avc_has_perm(&selinux_state, cred_sid(cred), cred_sid(tcred), SECCLASS_PROCESS, av, NULL); } static int selinux_task_setrlimit(struct task_struct *p, unsigned int resource, struct rlimit *new_rlim) { struct rlimit *old_rlim = p->signal->rlim + resource; /* Control the ability to change the hard limit (whether lowering or raising it), so that the hard limit can later be used as a safe reset point for the soft limit upon context transitions. See selinux_bprm_committing_creds. */ if (old_rlim->rlim_max != new_rlim->rlim_max) return avc_has_perm(&selinux_state, current_sid(), task_sid(p), SECCLASS_PROCESS, PROCESS__SETRLIMIT, NULL); return 0; } static int selinux_task_setscheduler(struct task_struct *p) { return avc_has_perm(&selinux_state, current_sid(), task_sid(p), SECCLASS_PROCESS, PROCESS__SETSCHED, NULL); } static int selinux_task_getscheduler(struct task_struct *p) { return avc_has_perm(&selinux_state, current_sid(), task_sid(p), SECCLASS_PROCESS, PROCESS__GETSCHED, NULL); } static int selinux_task_movememory(struct task_struct *p) { return avc_has_perm(&selinux_state, current_sid(), task_sid(p), SECCLASS_PROCESS, PROCESS__SETSCHED, NULL); } static int selinux_task_kill(struct task_struct *p, struct kernel_siginfo *info, int sig, const struct cred *cred) { u32 secid; u32 perm; if (!sig) perm = PROCESS__SIGNULL; /* null signal; existence test */ else perm = signal_to_av(sig); if (!cred) secid = current_sid(); else secid = cred_sid(cred); return avc_has_perm(&selinux_state, secid, task_sid(p), SECCLASS_PROCESS, perm, NULL); } static void selinux_task_to_inode(struct task_struct *p, struct inode *inode) { struct inode_security_struct *isec = selinux_inode(inode); u32 sid = task_sid(p); spin_lock(&isec->lock); isec->sclass = inode_mode_to_security_class(inode->i_mode); isec->sid = sid; isec->initialized = LABEL_INITIALIZED; spin_unlock(&isec->lock); } /* Returns error only if unable to parse addresses */ static int selinux_parse_skb_ipv4(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto) { int offset, ihlen, ret = -EINVAL; struct iphdr _iph, *ih; offset = skb_network_offset(skb); ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); if (ih == NULL) goto out; ihlen = ih->ihl * 4; if (ihlen < sizeof(_iph)) goto out; ad->u.net->v4info.saddr = ih->saddr; ad->u.net->v4info.daddr = ih->daddr; ret = 0; if (proto) *proto = ih->protocol; switch (ih->protocol) { case IPPROTO_TCP: { struct tcphdr _tcph, *th; if (ntohs(ih->frag_off) & IP_OFFSET) break; offset += ihlen; th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); if (th == NULL) break; ad->u.net->sport = th->source; ad->u.net->dport = th->dest; break; } case IPPROTO_UDP: { struct udphdr _udph, *uh; if (ntohs(ih->frag_off) & IP_OFFSET) break; offset += ihlen; uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); if (uh == NULL) break; ad->u.net->sport = uh->source; ad->u.net->dport = uh->dest; break; } case IPPROTO_DCCP: { struct dccp_hdr _dccph, *dh; if (ntohs(ih->frag_off) & IP_OFFSET) break; offset += ihlen; dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); if (dh == NULL) break; ad->u.net->sport = dh->dccph_sport; ad->u.net->dport = dh->dccph_dport; break; } #if IS_ENABLED(CONFIG_IP_SCTP) case IPPROTO_SCTP: { struct sctphdr _sctph, *sh; if (ntohs(ih->frag_off) & IP_OFFSET) break; offset += ihlen; sh = skb_header_pointer(skb, offset, sizeof(_sctph), &_sctph); if (sh == NULL) break; ad->u.net->sport = sh->source; ad->u.net->dport = sh->dest; break; } #endif default: break; } out: return ret; } #if IS_ENABLED(CONFIG_IPV6) /* Returns error only if unable to parse addresses */ static int selinux_parse_skb_ipv6(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto) { u8 nexthdr; int ret = -EINVAL, offset; struct ipv6hdr _ipv6h, *ip6; __be16 frag_off; offset = skb_network_offset(skb); ip6 = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (ip6 == NULL) goto out; ad->u.net->v6info.saddr = ip6->saddr; ad->u.net->v6info.daddr = ip6->daddr; ret = 0; nexthdr = ip6->nexthdr; offset += sizeof(_ipv6h); offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off); if (offset < 0) goto out; if (proto) *proto = nexthdr; switch (nexthdr) { case IPPROTO_TCP: { struct tcphdr _tcph, *th; th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); if (th == NULL) break; ad->u.net->sport = th->source; ad->u.net->dport = th->dest; break; } case IPPROTO_UDP: { struct udphdr _udph, *uh; uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); if (uh == NULL) break; ad->u.net->sport = uh->source; ad->u.net->dport = uh->dest; break; } case IPPROTO_DCCP: { struct dccp_hdr _dccph, *dh; dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); if (dh == NULL) break; ad->u.net->sport = dh->dccph_sport; ad->u.net->dport = dh->dccph_dport; break; } #if IS_ENABLED(CONFIG_IP_SCTP) case IPPROTO_SCTP: { struct sctphdr _sctph, *sh; sh = skb_header_pointer(skb, offset, sizeof(_sctph), &_sctph); if (sh == NULL) break; ad->u.net->sport = sh->source; ad->u.net->dport = sh->dest; break; } #endif /* includes fragments */ default: break; } out: return ret; } #endif /* IPV6 */ static int selinux_parse_skb(struct sk_buff *skb, struct common_audit_data *ad, char **_addrp, int src, u8 *proto) { char *addrp; int ret; switch (ad->u.net->family) { case PF_INET: ret = selinux_parse_skb_ipv4(skb, ad, proto); if (ret) goto parse_error; addrp = (char *)(src ? &ad->u.net->v4info.saddr : &ad->u.net->v4info.daddr); goto okay; #if IS_ENABLED(CONFIG_IPV6) case PF_INET6: ret = selinux_parse_skb_ipv6(skb, ad, proto); if (ret) goto parse_error; addrp = (char *)(src ? &ad->u.net->v6info.saddr : &ad->u.net->v6info.daddr); goto okay; #endif /* IPV6 */ default: addrp = NULL; goto okay; } parse_error: pr_warn( "SELinux: failure in selinux_parse_skb()," " unable to parse packet\n"); return ret; okay: if (_addrp) *_addrp = addrp; return 0; } /** * selinux_skb_peerlbl_sid - Determine the peer label of a packet * @skb: the packet * @family: protocol family * @sid: the packet's peer label SID * * Description: * Check the various different forms of network peer labeling and determine * the peer label/SID for the packet; most of the magic actually occurs in * the security server function security_net_peersid_cmp(). The function * returns zero if the value in @sid is valid (although it may be SECSID_NULL) * or -EACCES if @sid is invalid due to inconsistencies with the different * peer labels. * */ static int selinux_skb_peerlbl_sid(struct sk_buff *skb, u16 family, u32 *sid) { int err; u32 xfrm_sid; u32 nlbl_sid; u32 nlbl_type; err = selinux_xfrm_skb_sid(skb, &xfrm_sid); if (unlikely(err)) return -EACCES; err = selinux_netlbl_skbuff_getsid(skb, family, &nlbl_type, &nlbl_sid); if (unlikely(err)) return -EACCES; err = security_net_peersid_resolve(&selinux_state, nlbl_sid, nlbl_type, xfrm_sid, sid); if (unlikely(err)) { pr_warn( "SELinux: failure in selinux_skb_peerlbl_sid()," " unable to determine packet's peer label\n"); return -EACCES; } return 0; } /** * selinux_conn_sid - Determine the child socket label for a connection * @sk_sid: the parent socket's SID * @skb_sid: the packet's SID * @conn_sid: the resulting connection SID * * If @skb_sid is valid then the user:role:type information from @sk_sid is * combined with the MLS information from @skb_sid in order to create * @conn_sid. If @skb_sid is not valid then @conn_sid is simply a copy * of @sk_sid. Returns zero on success, negative values on failure. * */ static int selinux_conn_sid(u32 sk_sid, u32 skb_sid, u32 *conn_sid) { int err = 0; if (skb_sid != SECSID_NULL) err = security_sid_mls_copy(&selinux_state, sk_sid, skb_sid, conn_sid); else *conn_sid = sk_sid; return err; } /* socket security operations */ static int socket_sockcreate_sid(const struct task_security_struct *tsec, u16 secclass, u32 *socksid) { if (tsec->sockcreate_sid > SECSID_NULL) { *socksid = tsec->sockcreate_sid; return 0; } return security_transition_sid(&selinux_state, tsec->sid, tsec->sid, secclass, NULL, socksid); } static int sock_has_perm(struct sock *sk, u32 perms) { struct sk_security_struct *sksec = sk->sk_security; struct common_audit_data ad; struct lsm_network_audit net = {0,}; if (sksec->sid == SECINITSID_KERNEL) return 0; ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->sk = sk; return avc_has_perm(&selinux_state, current_sid(), sksec->sid, sksec->sclass, perms, &ad); } static int selinux_socket_create(int family, int type, int protocol, int kern) { const struct task_security_struct *tsec = selinux_cred(current_cred()); u32 newsid; u16 secclass; int rc; if (kern) return 0; secclass = socket_type_to_security_class(family, type, protocol); rc = socket_sockcreate_sid(tsec, secclass, &newsid); if (rc) return rc; return avc_has_perm(&selinux_state, tsec->sid, newsid, secclass, SOCKET__CREATE, NULL); } static int selinux_socket_post_create(struct socket *sock, int family, int type, int protocol, int kern) { const struct task_security_struct *tsec = selinux_cred(current_cred()); struct inode_security_struct *isec = inode_security_novalidate(SOCK_INODE(sock)); struct sk_security_struct *sksec; u16 sclass = socket_type_to_security_class(family, type, protocol); u32 sid = SECINITSID_KERNEL; int err = 0; if (!kern) { err = socket_sockcreate_sid(tsec, sclass, &sid); if (err) return err; } isec->sclass = sclass; isec->sid = sid; isec->initialized = LABEL_INITIALIZED; if (sock->sk) { sksec = sock->sk->sk_security; sksec->sclass = sclass; sksec->sid = sid; /* Allows detection of the first association on this socket */ if (sksec->sclass == SECCLASS_SCTP_SOCKET) sksec->sctp_assoc_state = SCTP_ASSOC_UNSET; err = selinux_netlbl_socket_post_create(sock->sk, family); } return err; } static int selinux_socket_socketpair(struct socket *socka, struct socket *sockb) { struct sk_security_struct *sksec_a = socka->sk->sk_security; struct sk_security_struct *sksec_b = sockb->sk->sk_security; sksec_a->peer_sid = sksec_b->sid; sksec_b->peer_sid = sksec_a->sid; return 0; } /* Range of port numbers used to automatically bind. Need to determine whether we should perform a name_bind permission check between the socket and the port number. */ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen) { struct sock *sk = sock->sk; struct sk_security_struct *sksec = sk->sk_security; u16 family; int err; err = sock_has_perm(sk, SOCKET__BIND); if (err) goto out; /* If PF_INET or PF_INET6, check name_bind permission for the port. */ family = sk->sk_family; if (family == PF_INET || family == PF_INET6) { char *addrp; struct common_audit_data ad; struct lsm_network_audit net = {0,}; struct sockaddr_in *addr4 = NULL; struct sockaddr_in6 *addr6 = NULL; u16 family_sa; unsigned short snum; u32 sid, node_perm; /* * sctp_bindx(3) calls via selinux_sctp_bind_connect() * that validates multiple binding addresses. Because of this * need to check address->sa_family as it is possible to have * sk->sk_family = PF_INET6 with addr->sa_family = AF_INET. */ if (addrlen < offsetofend(struct sockaddr, sa_family)) return -EINVAL; family_sa = address->sa_family; switch (family_sa) { case AF_UNSPEC: case AF_INET: if (addrlen < sizeof(struct sockaddr_in)) return -EINVAL; addr4 = (struct sockaddr_in *)address; if (family_sa == AF_UNSPEC) { /* see __inet_bind(), we only want to allow * AF_UNSPEC if the address is INADDR_ANY */ if (addr4->sin_addr.s_addr != htonl(INADDR_ANY)) goto err_af; family_sa = AF_INET; } snum = ntohs(addr4->sin_port); addrp = (char *)&addr4->sin_addr.s_addr; break; case AF_INET6: if (addrlen < SIN6_LEN_RFC2133) return -EINVAL; addr6 = (struct sockaddr_in6 *)address; snum = ntohs(addr6->sin6_port); addrp = (char *)&addr6->sin6_addr.s6_addr; break; default: goto err_af; } ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->sport = htons(snum); ad.u.net->family = family_sa; if (snum) { int low, high; inet_get_local_port_range(sock_net(sk), &low, &high); if (inet_port_requires_bind_service(sock_net(sk), snum) || snum < low || snum > high) { err = sel_netport_sid(sk->sk_protocol, snum, &sid); if (err) goto out; err = avc_has_perm(&selinux_state, sksec->sid, sid, sksec->sclass, SOCKET__NAME_BIND, &ad); if (err) goto out; } } switch (sksec->sclass) { case SECCLASS_TCP_SOCKET: node_perm = TCP_SOCKET__NODE_BIND; break; case SECCLASS_UDP_SOCKET: node_perm = UDP_SOCKET__NODE_BIND; break; case SECCLASS_DCCP_SOCKET: node_perm = DCCP_SOCKET__NODE_BIND; break; case SECCLASS_SCTP_SOCKET: node_perm = SCTP_SOCKET__NODE_BIND; break; default: node_perm = RAWIP_SOCKET__NODE_BIND; break; } err = sel_netnode_sid(addrp, family_sa, &sid); if (err) goto out; if (family_sa == AF_INET) ad.u.net->v4info.saddr = addr4->sin_addr.s_addr; else ad.u.net->v6info.saddr = addr6->sin6_addr; err = avc_has_perm(&selinux_state, sksec->sid, sid, sksec->sclass, node_perm, &ad); if (err) goto out; } out: return err; err_af: /* Note that SCTP services expect -EINVAL, others -EAFNOSUPPORT. */ if (sksec->sclass == SECCLASS_SCTP_SOCKET) return -EINVAL; return -EAFNOSUPPORT; } /* This supports connect(2) and SCTP connect services such as sctp_connectx(3) * and sctp_sendmsg(3) as described in Documentation/security/SCTP.rst */ static int selinux_socket_connect_helper(struct socket *sock, struct sockaddr *address, int addrlen) { struct sock *sk = sock->sk; struct sk_security_struct *sksec = sk->sk_security; int err; err = sock_has_perm(sk, SOCKET__CONNECT); if (err) return err; if (addrlen < offsetofend(struct sockaddr, sa_family)) return -EINVAL; /* connect(AF_UNSPEC) has special handling, as it is a documented * way to disconnect the socket */ if (address->sa_family == AF_UNSPEC) return 0; /* * If a TCP, DCCP or SCTP socket, check name_connect permission * for the port. */ if (sksec->sclass == SECCLASS_TCP_SOCKET || sksec->sclass == SECCLASS_DCCP_SOCKET || sksec->sclass == SECCLASS_SCTP_SOCKET) { struct common_audit_data ad; struct lsm_network_audit net = {0,}; struct sockaddr_in *addr4 = NULL; struct sockaddr_in6 *addr6 = NULL; unsigned short snum; u32 sid, perm; /* sctp_connectx(3) calls via selinux_sctp_bind_connect() * that validates multiple connect addresses. Because of this * need to check address->sa_family as it is possible to have * sk->sk_family = PF_INET6 with addr->sa_family = AF_INET. */ switch (address->sa_family) { case AF_INET: addr4 = (struct sockaddr_in *)address; if (addrlen < sizeof(struct sockaddr_in)) return -EINVAL; snum = ntohs(addr4->sin_port); break; case AF_INET6: addr6 = (struct sockaddr_in6 *)address; if (addrlen < SIN6_LEN_RFC2133) return -EINVAL; snum = ntohs(addr6->sin6_port); break; default: /* Note that SCTP services expect -EINVAL, whereas * others expect -EAFNOSUPPORT. */ if (sksec->sclass == SECCLASS_SCTP_SOCKET) return -EINVAL; else return -EAFNOSUPPORT; } err = sel_netport_sid(sk->sk_protocol, snum, &sid); if (err) return err; switch (sksec->sclass) { case SECCLASS_TCP_SOCKET: perm = TCP_SOCKET__NAME_CONNECT; break; case SECCLASS_DCCP_SOCKET: perm = DCCP_SOCKET__NAME_CONNECT; break; case SECCLASS_SCTP_SOCKET: perm = SCTP_SOCKET__NAME_CONNECT; break;