1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 
524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 // SPDX-License-Identifier: GPL-2.0 /* * Tag allocation using scalable bitmaps. Uses active queue tracking to support * fairer distribution of tags between multiple submitters when a shared tag map * is used. * * Copyright (C) 2013-2014 Jens Axboe */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/blk-mq.h> #include <linux/delay.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" /* * If a previously inactive queue goes active, bump the active user count. * We need to do this before try to allocate driver tag, then even if fail * to get tag when first time, the other shared-tag users could reserve * budget for it. */ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { if (blk_mq_is_sbitmap_shared(hctx->flags)) { struct request_queue *q = hctx->queue; struct blk_mq_tag_set *set = q->tag_set; if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) && !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) atomic_inc(&set->active_queues_shared_sbitmap); } else { if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) atomic_inc(&hctx->tags->active_queues); } return true; } /* * Wakeup all potentially sleeping on tags */ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) { sbitmap_queue_wake_all(tags->bitmap_tags); if (include_reserve) sbitmap_queue_wake_all(tags->breserved_tags); } /* * If a previously busy queue goes inactive, potential waiters could now * be allowed to queue. 
Wake them up and check. */ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { struct blk_mq_tags *tags = hctx->tags; struct request_queue *q = hctx->queue; struct blk_mq_tag_set *set = q->tag_set; if (blk_mq_is_sbitmap_shared(hctx->flags)) { if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return; atomic_dec(&set->active_queues_shared_sbitmap); } else { if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return; atomic_dec(&tags->active_queues); } blk_mq_tag_wakeup_all(tags, false); } static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt) { if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) && !hctx_may_queue(data->hctx, bt)) return BLK_MQ_NO_TAG; if (data->shallow_depth) return __sbitmap_queue_get_shallow(bt, data->shallow_depth); else return __sbitmap_queue_get(bt); } unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct sbitmap_queue *bt; struct sbq_wait_state *ws; DEFINE_SBQ_WAIT(wait); unsigned int tag_offset; int tag; if (data->flags & BLK_MQ_REQ_RESERVED) { if (unlikely(!tags->nr_reserved_tags)) { WARN_ON_ONCE(1); return BLK_MQ_NO_TAG; } bt = tags->breserved_tags; tag_offset = 0; } else { bt = tags->bitmap_tags; tag_offset = tags->nr_reserved_tags; } tag = __blk_mq_get_tag(data, bt); if (tag != BLK_MQ_NO_TAG) goto found_tag; if (data->flags & BLK_MQ_REQ_NOWAIT) return BLK_MQ_NO_TAG; ws = bt_wait_ptr(bt, data->hctx); do { struct sbitmap_queue *bt_prev; /* * We're out of tags on this hardware queue, kick any * pending IO submits before going to sleep waiting for * some to complete. */ blk_mq_run_hw_queue(data->hctx, false); /* * Retry tag allocation after running the hardware queue, * as running the queue may also have found completions. 
*/ tag = __blk_mq_get_tag(data, bt); if (tag != BLK_MQ_NO_TAG) break; sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE); tag = __blk_mq_get_tag(data, bt); if (tag != BLK_MQ_NO_TAG) break; bt_prev = bt; io_schedule(); sbitmap_finish_wait(bt, ws, &wait); data->ctx = blk_mq_get_ctx(data->q); data->hctx = blk_mq_map_queue(data->q, data->cmd_flags, data->ctx); tags = blk_mq_tags_from_data(data); if (data->flags & BLK_MQ_REQ_RESERVED) bt = tags->breserved_tags; else bt = tags->bitmap_tags; /* * If destination hw queue is changed, fake wake up on * previous queue for compensating the wake up miss, so * other allocations on previous queue won't be starved. */ if (bt != bt_prev) sbitmap_queue_wake_up(bt_prev); ws = bt_wait_ptr(bt, data->hctx); } while (1); sbitmap_finish_wait(bt, ws, &wait); found_tag: /* * Give up this allocation if the hctx is inactive. The caller will * retry on an active hctx. */ if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) { blk_mq_put_tag(tags, data->ctx, tag + tag_offset); return BLK_MQ_NO_TAG; } return tag + tag_offset; } void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag) { if (!blk_mq_tag_is_reserved(tags, tag)) { const int real_tag = tag - tags->nr_reserved_tags; BUG_ON(real_tag >= tags->nr_tags); sbitmap_queue_clear(tags->bitmap_tags, real_tag, ctx->cpu); } else { BUG_ON(tag >= tags->nr_reserved_tags); sbitmap_queue_clear(tags->breserved_tags, tag, ctx->cpu); } } struct bt_iter_data { struct blk_mq_hw_ctx *hctx; busy_iter_fn *fn; void *data; bool reserved; }; static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, unsigned int bitnr) { struct request *rq; unsigned long flags; spin_lock_irqsave(&tags->lock, flags); rq = tags->rqs[bitnr]; if (!rq || rq->tag != bitnr || !refcount_inc_not_zero(&rq->ref)) rq = NULL; spin_unlock_irqrestore(&tags->lock, flags); return rq; } static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_iter_data 
*iter_data = data; struct blk_mq_hw_ctx *hctx = iter_data->hctx; struct blk_mq_tags *tags = hctx->tags; bool reserved = iter_data->reserved; struct request *rq; bool ret = true; if (!reserved) bitnr += tags->nr_reserved_tags; /* * We can hit rq == NULL here, because the tagging functions * test and set the bit before assigning ->rqs[]. */ rq = blk_mq_find_and_get_req(tags, bitnr); if (!rq) return true; if (rq->q == hctx->queue && rq->mq_hctx == hctx) ret = iter_data->fn(hctx, rq, iter_data->data, reserved); blk_mq_put_rq_ref(rq); return ret; } /** * bt_for_each - iterate over the requests associated with a hardware queue * @hctx: Hardware queue to examine. * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each request * associated with @hctx that has been assigned a driver tag. * @fn will be called as follows: @fn(@hctx, rq, @data, @reserved) * where rq is a pointer to a request. Return true to continue * iterating tags, false to stop. * @data: Will be passed as third argument to @fn. * @reserved: Indicates whether @bt is the breserved_tags member or the * bitmap_tags member of struct blk_mq_tags. 
*/ static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, busy_iter_fn *fn, void *data, bool reserved) { struct bt_iter_data iter_data = { .hctx = hctx, .fn = fn, .data = data, .reserved = reserved, }; sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data); } struct bt_tags_iter_data { struct blk_mq_tags *tags; busy_tag_iter_fn *fn; void *data; unsigned int flags; }; #define BT_TAG_ITER_RESERVED (1 << 0) #define BT_TAG_ITER_STARTED (1 << 1) #define BT_TAG_ITER_STATIC_RQS (1 << 2) static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_tags_iter_data *iter_data = data; struct blk_mq_tags *tags = iter_data->tags; bool reserved = iter_data->flags & BT_TAG_ITER_RESERVED; struct request *rq; bool ret = true; bool iter_static_rqs = !!(iter_data->flags & BT_TAG_ITER_STATIC_RQS); if (!reserved) bitnr += tags->nr_reserved_tags; /* * We can hit rq == NULL here, because the tagging functions * test and set the bit before assigning ->rqs[]. */ if (iter_static_rqs) rq = tags->static_rqs[bitnr]; else rq = blk_mq_find_and_get_req(tags, bitnr); if (!rq) return true; if (!(iter_data->flags & BT_TAG_ITER_STARTED) || blk_mq_request_started(rq)) ret = iter_data->fn(rq, iter_data->data, reserved); if (!iter_static_rqs) blk_mq_put_rq_ref(rq); return ret; } /** * bt_tags_for_each - iterate over the requests in a tag map * @tags: Tag map to iterate over. * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each started * request. @fn will be called as follows: @fn(rq, @data, * @reserved) where rq is a pointer to a request. Return true * to continue iterating tags, false to stop. * @data: Will be passed as second argument to @fn. 
* @flags: BT_TAG_ITER_* */ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt, busy_tag_iter_fn *fn, void *data, unsigned int flags) { struct bt_tags_iter_data iter_data = { .tags = tags, .fn = fn, .data = data, .flags = flags, }; if (tags->rqs) sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data); } static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv, unsigned int flags) { WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED); if (tags->nr_reserved_tags) bt_tags_for_each(tags, tags->breserved_tags, fn, priv, flags | BT_TAG_ITER_RESERVED); bt_tags_for_each(tags, tags->bitmap_tags, fn, priv, flags); } /** * blk_mq_all_tag_iter - iterate over all requests in a tag map * @tags: Tag map to iterate over. * @fn: Pointer to the function that will be called for each * request. @fn will be called as follows: @fn(rq, @priv, * reserved) where rq is a pointer to a request. 'reserved' * indicates whether or not @rq is a reserved request. Return * true to continue iterating tags, false to stop. * @priv: Will be passed as second argument to @fn. * * Caller has to pass the tag map from which requests are allocated. */ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv) { __blk_mq_all_tag_iter(tags, fn, priv, BT_TAG_ITER_STATIC_RQS); } /** * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set * @tagset: Tag set to iterate over. * @fn: Pointer to the function that will be called for each started * request. @fn will be called as follows: @fn(rq, @priv, * reserved) where rq is a pointer to a request. 'reserved' * indicates whether or not @rq is a reserved request. Return * true to continue iterating tags, false to stop. * @priv: Will be passed as second argument to @fn. * * We grab one request reference before calling @fn and release it after * @fn returns. 
*/ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv) { int i; for (i = 0; i < tagset->nr_hw_queues; i++) { if (tagset->tags && tagset->tags[i]) __blk_mq_all_tag_iter(tagset->tags[i], fn, priv, BT_TAG_ITER_STARTED); } } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); static bool blk_mq_tagset_count_completed_rqs(struct request *rq, void *data, bool reserved) { unsigned *count = data; if (blk_mq_request_completed(rq)) (*count)++; return true; } /** * blk_mq_tagset_wait_completed_request - wait until all completed req's * complete funtion is run * @tagset: Tag set to drain completed request * * Note: This function has to be run after all IO queues are shutdown */ void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset) { while (true) { unsigned count = 0; blk_mq_tagset_busy_iter(tagset, blk_mq_tagset_count_completed_rqs, &count); if (!count) break; msleep(5); } } EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); /** * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag * @q: Request queue to examine. * @fn: Pointer to the function that will be called for each request * on @q. @fn will be called as follows: @fn(hctx, rq, @priv, * reserved) where rq is a pointer to a request and hctx points * to the hardware queue associated with the request. 'reserved' * indicates whether or not @rq is a reserved request. * @priv: Will be passed as third argument to @fn. * * Note: if @q->tag_set is shared with other request queues then @fn will be * called for all requests on all queues that share that tag set and not only * for requests associated with @q. */ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv) { struct blk_mq_hw_ctx *hctx; int i; /* * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx * while the queue is frozen. So we can use q_usage_counter to avoid * racing with it. 
*/ if (!percpu_ref_tryget(&q->q_usage_counter)) return; queue_for_each_hw_ctx(q, hctx, i) { struct blk_mq_tags *tags = hctx->tags; /* * If no software queues are currently mapped to this * hardware queue, there's nothing to check */ if (!blk_mq_hw_queue_mapped(hctx)) continue; if (tags->nr_reserved_tags) bt_for_each(hctx, tags->breserved_tags, fn, priv, true); bt_for_each(hctx, tags->bitmap_tags, fn, priv, false); } blk_queue_exit(q); } static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, bool round_robin, int node) { return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL, node); } static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, int node, int alloc_policy) { unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; if (bt_alloc(&tags->__bitmap_tags, depth, round_robin, node)) return -ENOMEM; if (bt_alloc(&tags->__breserved_tags, tags->nr_reserved_tags, round_robin, node)) goto free_bitmap_tags; tags->bitmap_tags = &tags->__bitmap_tags; tags->breserved_tags = &tags->__breserved_tags; return 0; free_bitmap_tags: sbitmap_queue_free(&tags->__bitmap_tags); return -ENOMEM; } int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags) { unsigned int depth = set->queue_depth - set->reserved_tags; int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags); bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; int i, node = set->numa_node; if (bt_alloc(&set->__bitmap_tags, depth, round_robin, node)) return -ENOMEM; if (bt_alloc(&set->__breserved_tags, set->reserved_tags, round_robin, node)) goto free_bitmap_tags; for (i = 0; i < set->nr_hw_queues; i++) { struct blk_mq_tags *tags = set->tags[i]; tags->bitmap_tags = &set->__bitmap_tags; tags->breserved_tags = &set->__breserved_tags; } return 0; free_bitmap_tags: sbitmap_queue_free(&set->__bitmap_tags); return -ENOMEM; } void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set) { 
sbitmap_queue_free(&set->__bitmap_tags); sbitmap_queue_free(&set->__breserved_tags); } struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, unsigned int reserved_tags, int node, unsigned int flags) { int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags); struct blk_mq_tags *tags; if (total_tags > BLK_MQ_TAG_MAX) { pr_err("blk-mq: tag depth too large\n"); return NULL; } tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node); if (!tags) return NULL; tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; spin_lock_init(&tags->lock); if (flags & BLK_MQ_F_TAG_HCTX_SHARED) return tags; if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) { kfree(tags); return NULL; } return tags; } void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags) { if (!(flags & BLK_MQ_F_TAG_HCTX_SHARED)) { sbitmap_queue_free(tags->bitmap_tags); sbitmap_queue_free(tags->breserved_tags); } kfree(tags); } int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags **tagsptr, unsigned int tdepth, bool can_grow) { struct blk_mq_tags *tags = *tagsptr; if (tdepth <= tags->nr_reserved_tags) return -EINVAL; /* * If we are allowed to grow beyond the original size, allocate * a new set of tags before freeing the old one. */ if (tdepth > tags->nr_tags) { struct blk_mq_tag_set *set = hctx->queue->tag_set; /* Only sched tags can grow, so clear HCTX_SHARED flag */ unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; struct blk_mq_tags *new; bool ret; if (!can_grow) return -EINVAL; /* * We need some sort of upper limit, set it high enough that * no valid use cases should require more. 
*/ if (tdepth > 16 * BLKDEV_MAX_RQ) return -EINVAL; new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, tags->nr_reserved_tags, flags); if (!new) return -ENOMEM; ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth); if (ret) { blk_mq_free_rq_map(new, flags); return -ENOMEM; } blk_mq_free_rqs(set, *tagsptr, hctx->queue_num); blk_mq_free_rq_map(*tagsptr, flags); *tagsptr = new; } else { /* * Don't need (or can't) update reserved tags here, they * remain static and should never need resizing. */ sbitmap_queue_resize(tags->bitmap_tags, tdepth - tags->nr_reserved_tags); } return 0; } void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size) { sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags); } /** * blk_mq_unique_tag() - return a tag that is unique queue-wide * @rq: request for which to compute a unique tag * * The tag field in struct request is unique per hardware queue but not over * all hardware queues. Hence this function that returns a tag with the * hardware context index in the upper bits and the per hardware queue tag in * the lower bits. * * Note: When called for a request that is queued on a non-multiqueue request * queue, the hardware context index is set to zero. */ u32 blk_mq_unique_tag(struct request *rq) { return (rq->mq_hctx->queue_num << BLK_MQ_UNIQUE_TAG_BITS) | (rq->tag & BLK_MQ_UNIQUE_TAG_MASK); } EXPORT_SYMBOL(blk_mq_unique_tag);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RCULIST_H #define _LINUX_RCULIST_H #ifdef __KERNEL__ /* * RCU-protected list version */ #include <linux/list.h> #include <linux/rcupdate.h> /* * Why is there no list_empty_rcu()? Because list_empty() serves this * purpose. The list_empty() function fetches the RCU-protected pointer * and compares it to the address of the list head, but neither dereferences * this pointer itself nor provides this pointer to the caller. Therefore, * it is not necessary to use rcu_dereference(), so that list_empty() can * be used anywhere you would want to use a list_empty_rcu(). */ /* * INIT_LIST_HEAD_RCU - Initialize a list_head visible to RCU readers * @list: list to be initialized * * You should instead use INIT_LIST_HEAD() for normal initialization and * cleanup tasks, when readers have no access to the list being initialized. 
* However, if the list being initialized is visible to readers, you * need to keep the compiler from being too mischievous. */ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) { WRITE_ONCE(list->next, list); WRITE_ONCE(list->prev, list); } /* * return the ->next pointer of a list_head in an rcu safe * way, we must not access it directly */ #define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next))) /** * list_tail_rcu - returns the prev pointer of the head of the list * @head: the head of the list * * Note: This should only be used with the list header, and even then * only if list_del() and similar primitives are not also used on the * list header. */ #define list_tail_rcu(head) (*((struct list_head __rcu **)(&(head)->prev))) /* * Check during list traversal that we are within an RCU reader */ #define check_arg_count_one(dummy) #ifdef CONFIG_PROVE_RCU_LIST #define __list_check_rcu(dummy, cond, extra...) \ ({ \ check_arg_count_one(extra); \ RCU_LOCKDEP_WARN(!(cond) && !rcu_read_lock_any_held(), \ "RCU-list traversed in non-reader section!"); \ }) #define __list_check_srcu(cond) \ ({ \ RCU_LOCKDEP_WARN(!(cond), \ "RCU-list traversed without holding the required lock!");\ }) #else #define __list_check_rcu(dummy, cond, extra...) \ ({ check_arg_count_one(extra); }) #define __list_check_srcu(cond) ({ }) #endif /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_add_rcu(struct list_head *new, struct list_head *prev, struct list_head *next) { if (!__list_add_valid(new, prev, next)) return; new->next = next; new->prev = prev; rcu_assign_pointer(list_next_rcu(prev), new); next->prev = new; } /** * list_add_rcu - add a new entry to rcu-protected list * @new: new entry to be added * @head: list head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. 
* * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as list_add_rcu() * or list_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * list_for_each_entry_rcu(). */ static inline void list_add_rcu(struct list_head *new, struct list_head *head) { __list_add_rcu(new, head, head->next); } /** * list_add_tail_rcu - add a new entry to rcu-protected list * @new: new entry to be added * @head: list head to add it before * * Insert a new entry before the specified head. * This is useful for implementing queues. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as list_add_tail_rcu() * or list_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * list_for_each_entry_rcu(). */ static inline void list_add_tail_rcu(struct list_head *new, struct list_head *head) { __list_add_rcu(new, head->prev, head); } /** * list_del_rcu - deletes entry from list without re-initialization * @entry: the element to delete from the list. * * Note: list_empty() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as list_del_rcu() * or list_add_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * list_for_each_entry_rcu(). 
* * Note that the caller is not permitted to immediately free * the newly deleted entry. Instead, either synchronize_rcu() * or call_rcu() must be used to defer freeing until an RCU * grace period has elapsed. */ static inline void list_del_rcu(struct list_head *entry) { __list_del_entry(entry); entry->prev = LIST_POISON2; } /** * hlist_del_init_rcu - deletes entry from hash list with re-initialization * @n: the element to delete from the hash list. * * Note: list_unhashed() on the node return true after this. It is * useful for RCU based read lockfree traversal if the writer side * must know if the list entry is still hashed or already unhashed. * * In particular, it means that we can not poison the forward pointers * that may still be used for walking the hash list and we can only * zero the pprev pointer so list_unhashed() will return true after * this. * * The caller must take whatever precautions are necessary (such as * holding appropriate locks) to avoid racing with another * list-mutation primitive, such as hlist_add_head_rcu() or * hlist_del_rcu(), running on this same list. However, it is * perfectly legal to run concurrently with the _rcu list-traversal * primitives, such as hlist_for_each_entry_rcu(). */ static inline void hlist_del_init_rcu(struct hlist_node *n) { if (!hlist_unhashed(n)) { __hlist_del(n); WRITE_ONCE(n->pprev, NULL); } } /** * list_replace_rcu - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * The @old entry will be replaced with the @new entry atomically. * Note: @old should not be empty. */ static inline void list_replace_rcu(struct list_head *old, struct list_head *new) { new->next = old->next; new->prev = old->prev; rcu_assign_pointer(list_next_rcu(new->prev), new); new->next->prev = new; old->prev = LIST_POISON2; } /** * __list_splice_init_rcu - join an RCU-protected list into an existing list. 
* @list: the RCU-protected list to splice * @prev: points to the last element of the existing list * @next: points to the first element of the existing list * @sync: synchronize_rcu, synchronize_rcu_expedited, ... * * The list pointed to by @prev and @next can be RCU-read traversed * concurrently with this function. * * Note that this function blocks. * * Important note: the caller must take whatever action is necessary to prevent * any other updates to the existing list. In principle, it is possible to * modify the list as soon as sync() begins execution. If this sort of thing * becomes necessary, an alternative version based on call_rcu() could be * created. But only if -really- needed -- there is no shortage of RCU API * members. */ static inline void __list_splice_init_rcu(struct list_head *list, struct list_head *prev, struct list_head *next, void (*sync)(void)) { struct list_head *first = list->next; struct list_head *last = list->prev; /* * "first" and "last" tracking list, so initialize it. RCU readers * have access to this list, so we must use INIT_LIST_HEAD_RCU() * instead of INIT_LIST_HEAD(). */ INIT_LIST_HEAD_RCU(list); /* * At this point, the list body still points to the source list. * Wait for any readers to finish using the list before splicing * the list body into the new list. Any new readers will see * an empty list. */ sync(); ASSERT_EXCLUSIVE_ACCESS(*first); ASSERT_EXCLUSIVE_ACCESS(*last); /* * Readers are finished with the source list, so perform splice. * The order is important if the new list is global and accessible * to concurrent RCU readers. Note that RCU readers are not * permitted to traverse the prev pointers without excluding * this function. */ last->next = next; rcu_assign_pointer(list_next_rcu(prev), first); first->prev = prev; next->prev = last; } /** * list_splice_init_rcu - splice an RCU-protected list into an existing list, * designed for stacks. 
* @list: the RCU-protected list to splice * @head: the place in the existing list to splice the first list into * @sync: synchronize_rcu, synchronize_rcu_expedited, ... */ static inline void list_splice_init_rcu(struct list_head *list, struct list_head *head, void (*sync)(void)) { if (!list_empty(list)) __list_splice_init_rcu(list, head, head->next, sync); } /** * list_splice_tail_init_rcu - splice an RCU-protected list into an existing * list, designed for queues. * @list: the RCU-protected list to splice * @head: the place in the existing list to splice the first list into * @sync: synchronize_rcu, synchronize_rcu_expedited, ... */ static inline void list_splice_tail_init_rcu(struct list_head *list, struct list_head *head, void (*sync)(void)) { if (!list_empty(list)) __list_splice_init_rcu(list, head->prev, head, sync); } /** * list_entry_rcu - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * This primitive may safely run concurrently with the _rcu list-mutation * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_entry_rcu(ptr, type, member) \ container_of(READ_ONCE(ptr), type, member) /* * Where are list_empty_rcu() and list_first_entry_rcu()? * * Implementing those functions following their counterparts list_empty() and * list_first_entry() is not advisable because they lead to subtle race * conditions as the following snippet shows: * * if (!list_empty_rcu(mylist)) { * struct foo *bar = list_first_entry_rcu(mylist, struct foo, list_member); * do_something(bar); * } * * The list may not be empty when list_empty_rcu checks it, but it may be when * list_first_entry_rcu rereads the ->next pointer. * * Rereading the ->next pointer is not a problem for list_empty() and * list_first_entry() because they would be protected by a lock that blocks * writers. 
*
 * See list_first_or_null_rcu for an alternative.
 */

/**
 * list_first_or_null_rcu - get the first element from a list
 * @ptr: the list head to take the element from.
 * @type: the type of the struct this is embedded in.
 * @member: the name of the list_head within the struct.
 *
 * Note that if the list is empty, it returns NULL.
 *
 * The ->next pointer is read exactly once, so the emptiness test and the
 * returned entry are based on the same snapshot.
 *
 * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 */
#define list_first_or_null_rcu(ptr, type, member) \
({ \
	struct list_head *__ptr = (ptr); \
	struct list_head *__next = READ_ONCE(__ptr->next); \
	likely(__ptr != __next) ? list_entry_rcu(__next, type, member) : NULL; \
})

/**
 * list_next_or_null_rcu - get the next element from a list
 * @head: the head for the list.
 * @ptr: the list head to take the next element from.
 * @type: the type of the struct this is embedded in.
 * @member: the name of the list_head within the struct.
 *
 * Note that if the ptr is at the end of the list, NULL is returned.
 *
 * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 */
#define list_next_or_null_rcu(head, ptr, type, member) \
({ \
	struct list_head *__head = (head); \
	struct list_head *__ptr = (ptr); \
	struct list_head *__next = READ_ONCE(__ptr->next); \
	likely(__next != __head) ? list_entry_rcu(__next, type, \
						  member) : NULL; \
})

/**
 * list_for_each_entry_rcu - iterate over rcu list of given type
 * @pos: the type * to use as a loop cursor.
 * @head: the head for your list.
 * @member: the name of the list_head within the struct.
 * @cond: optional lockdep expression if called from non-RCU protection.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as list_add_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * @cond, when given, is forwarded to __list_check_rcu() once, before the
 * first entry is fetched.
 */
#define list_for_each_entry_rcu(pos, head, member, cond...) \
	for (__list_check_rcu(dummy, ## cond, 0), \
	     pos = list_entry_rcu((head)->next, typeof(*pos), member); \
		&pos->member != (head); \
		pos = list_entry_rcu(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_srcu - iterate over rcu list of given type
 * @pos: the type * to use as a loop cursor.
 * @head: the head for your list.
 * @member: the name of the list_head within the struct.
 * @cond: lockdep expression for the lock required to traverse the list.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as list_add_rcu()
 * as long as the traversal is guarded by srcu_read_lock().
 * The lockdep expression srcu_read_lock_held() can be passed as the
 * cond argument from read side.
 */
#define list_for_each_entry_srcu(pos, head, member, cond) \
	for (__list_check_srcu(cond), \
	     pos = list_entry_rcu((head)->next, typeof(*pos), member); \
		&pos->member != (head); \
		pos = list_entry_rcu(pos->member.next, typeof(*pos), member))

/**
 * list_entry_lockless - get the struct for this entry
 * @ptr: the &struct list_head pointer.
 * @type: the type of the struct this is embedded in.
 * @member: the name of the list_head within the struct.
 *
 * This primitive may safely run concurrently with the _rcu
 * list-mutation primitives such as list_add_rcu(), but requires some
 * implicit RCU read-side guarding. One example is running within a special
 * exception-time environment where preemption is disabled and where lockdep
 * cannot be invoked. Another example is when items are added to the list,
 * but never deleted.
 */
#define list_entry_lockless(ptr, type, member) \
	container_of((typeof(ptr))READ_ONCE(ptr), type, member)

/**
 * list_for_each_entry_lockless - iterate over rcu list of given type
 * @pos: the type * to use as a loop cursor.
 * @head: the head for your list.
 * @member: the name of the list_struct within the struct.
*
 * This primitive may safely run concurrently with the _rcu
 * list-mutation primitives such as list_add_rcu(), but requires some
 * implicit RCU read-side guarding. One example is running within a special
 * exception-time environment where preemption is disabled and where lockdep
 * cannot be invoked. Another example is when items are added to the list,
 * but never deleted.
 *
 * Unlike list_for_each_entry_rcu(), no lockdep check is made before the
 * walk starts.
 */
#define list_for_each_entry_lockless(pos, head, member) \
	for (pos = list_entry_lockless((head)->next, typeof(*pos), member); \
	     &pos->member != (head); \
	     pos = list_entry_lockless(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_continue_rcu - continue iteration over list of given type
 * @pos: the type * to use as a loop cursor.
 * @head: the head for your list.
 * @member: the name of the list_head within the struct.
 *
 * Continue to iterate over list of given type, continuing after
 * the current position which must have been in the list when the RCU read
 * lock was taken.
 * This would typically require either that you obtained the node from a
 * previous walk of the list in the same RCU read-side critical section, or
 * that you held some sort of non-RCU reference (such as a reference count)
 * to keep the node alive *and* in the list.
 *
 * This iterator is similar to list_for_each_entry_from_rcu() except
 * this starts after the given position and that one starts at the given
 * position.
 */
#define list_for_each_entry_continue_rcu(pos, head, member) \
	for (pos = list_entry_rcu(pos->member.next, typeof(*pos), member); \
	     &pos->member != (head); \
	     pos = list_entry_rcu(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_from_rcu - iterate over a list from current point
 * @pos: the type * to use as a loop cursor.
 * @head: the head for your list.
 * @member: the name of the list_head within the struct.
 *
 * Iterate over the tail of a list starting from a given position,
 * which must have been in the list when the RCU read lock was taken.
* This would typically require either that you obtained the node from a
 * previous walk of the list in the same RCU read-side critical section, or
 * that you held some sort of non-RCU reference (such as a reference count)
 * to keep the node alive *and* in the list.
 *
 * This iterator is similar to list_for_each_entry_continue_rcu() except
 * this starts from the given position and that one starts from the position
 * after the given position.
 *
 * There is no initial assignment: the first loop body runs on @pos itself,
 * unless @pos already refers to @head.
 */
#define list_for_each_entry_from_rcu(pos, head, member) \
	for (; &(pos)->member != (head); \
		pos = list_entry_rcu(pos->member.next, typeof(*(pos)), member))

/**
 * hlist_del_rcu - deletes entry from hash list without re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_unhashed() on entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the hash list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu().
 */
static inline void hlist_del_rcu(struct hlist_node *n)
{
	__hlist_del(n);
	/* Only the back pointer is poisoned here; n->next is not written. */
	WRITE_ONCE(n->pprev, LIST_POISON2);
}

/**
 * hlist_replace_rcu - replace old entry by new one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * The @old entry will be replaced with the @new entry atomically.
*/ static inline void hlist_replace_rcu(struct hlist_node *old, struct hlist_node *new) { struct hlist_node *next = old->next; new->next = next; WRITE_ONCE(new->pprev, old->pprev); rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new); if (next) WRITE_ONCE(new->next->pprev, &new->next); WRITE_ONCE(old->pprev, LIST_POISON2); } /** * hlists_swap_heads_rcu - swap the lists the hlist heads point to * @left: The hlist head on the left * @right: The hlist head on the right * * The lists start out as [@left ][node1 ... ] and * [@right ][node2 ... ] * The lists end up as [@left ][node2 ... ] * [@right ][node1 ... ] */ static inline void hlists_swap_heads_rcu(struct hlist_head *left, struct hlist_head *right) { struct hlist_node *node1 = left->first; struct hlist_node *node2 = right->first; rcu_assign_pointer(left->first, node2); rcu_assign_pointer(right->first, node1); WRITE_ONCE(node2->pprev, &left->first); WRITE_ONCE(node1->pprev, &right->first); } /* * return the first or the next element in an RCU protected hlist */ #define hlist_first_rcu(head) (*((struct hlist_node __rcu **)(&(head)->first))) #define hlist_next_rcu(node) (*((struct hlist_node __rcu **)(&(node)->next))) #define hlist_pprev_rcu(node) (*((struct hlist_node __rcu **)((node)->pprev))) /** * hlist_add_head_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. 
Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_add_head_rcu(struct hlist_node *n,
					struct hlist_head *h)
{
	struct hlist_node *first = h->first;

	/* Set up both of @n's links before publishing it as the new first. */
	n->next = first;
	WRITE_ONCE(n->pprev, &h->first);
	rcu_assign_pointer(hlist_first_rcu(h), n);
	/* The old first node (if any) now hangs off @n. */
	if (first)
		WRITE_ONCE(first->pprev, &n->next);
}

/**
 * hlist_add_tail_rcu
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Adds the specified element to the specified hlist,
 * while permitting racing traversals.
 *
 * The whole list is walked to find its last node, so the cost grows with
 * the list length.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs. Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_add_tail_rcu(struct hlist_node *n,
				      struct hlist_head *h)
{
	struct hlist_node *i, *last = NULL;

	/* Note: write side code, so rcu accessors are not needed. */
	for (i = h->first; i; i = i->next)
		last = i;

	if (last) {
		n->next = last->next;
		WRITE_ONCE(n->pprev, &last->next);
		rcu_assign_pointer(hlist_next_rcu(last), n);
	} else {
		/* Empty list: appending is the same as adding at the head. */
		hlist_add_head_rcu(n, h);
	}
}

/**
 * hlist_add_before_rcu
 * @n: the new element to add to the hash list.
 * @next: the existing element to add the new element before.
 *
 * Description:
 * Adds the specified element to the specified hlist
 * before the specified node while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
* However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.
 */
static inline void hlist_add_before_rcu(struct hlist_node *n,
					struct hlist_node *next)
{
	/* @n takes over @next's slot: whatever pointed at @next will point at @n. */
	WRITE_ONCE(n->pprev, next->pprev);
	n->next = next;
	rcu_assign_pointer(hlist_pprev_rcu(n), n);
	WRITE_ONCE(next->pprev, &n->next);
}

/**
 * hlist_add_behind_rcu
 * @n: the new element to add to the hash list.
 * @prev: the existing element to add the new element after.
 *
 * Description:
 * Adds the specified element to the specified hlist
 * after the specified node while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.
 */
static inline void hlist_add_behind_rcu(struct hlist_node *n,
					struct hlist_node *prev)
{
	n->next = prev->next;
	WRITE_ONCE(n->pprev, &prev->next);
	rcu_assign_pointer(hlist_next_rcu(prev), n);
	/* @prev may have been the last node, in which case there is no successor. */
	if (n->next)
		WRITE_ONCE(n->next->pprev, &n->next);
}

/* Walk the raw hlist_node chain of @head; @pos is a struct hlist_node *. */
#define __hlist_for_each_rcu(pos, head) \
	for (pos = rcu_dereference(hlist_first_rcu(head)); \
	     pos; \
	     pos = rcu_dereference(hlist_next_rcu(pos)))

/**
 * hlist_for_each_entry_rcu - iterate over rcu list of given type
 * @pos: the type * to use as a loop cursor.
 * @head: the head for your list.
 * @member: the name of the hlist_node within the struct.
 * @cond: optional lockdep expression if called from non-RCU protection.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
*
 * @cond, when given, is forwarded to __list_check_rcu() once, before the
 * first entry is fetched. The loop ends when the next pointer is NULL.
 */
#define hlist_for_each_entry_rcu(pos, head, member, cond...) \
	for (__list_check_rcu(dummy, ## cond, 0), \
	     pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
			typeof(*(pos)), member); \
		pos; \
		pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
			&(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_srcu - iterate over rcu list of given type
 * @pos: the type * to use as a loop cursor.
 * @head: the head for your list.
 * @member: the name of the hlist_node within the struct.
 * @cond: lockdep expression for the lock required to traverse the list.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by srcu_read_lock().
 * The lockdep expression srcu_read_lock_held() can be passed as the
 * cond argument from read side.
 */
#define hlist_for_each_entry_srcu(pos, head, member, cond) \
	for (__list_check_srcu(cond), \
	     pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
			typeof(*(pos)), member); \
		pos; \
		pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
			&(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_rcu_notrace - iterate over rcu list of given type (for tracing)
 * @pos: the type * to use as a loop cursor.
 * @head: the head for your list.
 * @member: the name of the hlist_node within the struct.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * This is the same as hlist_for_each_entry_rcu() except that it does
 * not do any RCU debugging or tracing.
*
 * It has no @cond argument and makes no __list_check_rcu() call; pointers
 * are fetched with rcu_dereference_raw_check().
 */
#define hlist_for_each_entry_rcu_notrace(pos, head, member) \
	for (pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_first_rcu(head)),\
			typeof(*(pos)), member); \
		pos; \
		pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_next_rcu(\
			&(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type
 * @pos: the type * to use as a loop cursor.
 * @head: the head for your list.
 * @member: the name of the hlist_node within the struct.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * NOTE(review): the pointers are fetched with rcu_dereference_bh(), so the
 * required guard is presumably rcu_read_lock_bh() rather than
 * rcu_read_lock() -- confirm against the RCU documentation.
 */
#define hlist_for_each_entry_rcu_bh(pos, head, member) \
	for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_first_rcu(head)),\
			typeof(*(pos)), member); \
		pos; \
		pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(\
			&(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_continue_rcu - iterate over a hlist continuing after current point
 * @pos: the type * to use as a loop cursor.
 * @member: the name of the hlist_node within the struct.
 *
 * The walk starts at the entry following @pos; @pos itself is not visited.
 */
#define hlist_for_each_entry_continue_rcu(pos, member) \
	for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
			&(pos)->member)), typeof(*(pos)), member); \
	     pos; \
	     pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
			&(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_continue_rcu_bh - iterate over a hlist continuing after current point
 * @pos: the type * to use as a loop cursor.
 * @member: the name of the hlist_node within the struct.
*
 * The walk starts at the entry following @pos; @pos itself is not visited.
 * Pointers are fetched with rcu_dereference_bh().
 */
#define hlist_for_each_entry_continue_rcu_bh(pos, member) \
	for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu( \
			&(pos)->member)), typeof(*(pos)), member); \
	     pos; \
	     pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu( \
			&(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_from_rcu - iterate over a hlist continuing from current point
 * @pos: the type * to use as a loop cursor.
 * @member: the name of the hlist_node within the struct.
 *
 * There is no initial assignment: the first loop body runs on @pos itself,
 * and nothing runs at all when @pos is NULL.
 */
#define hlist_for_each_entry_from_rcu(pos, member) \
	for (; pos; \
	     pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
			&(pos)->member)), typeof(*(pos)), member))

#endif	/* __KERNEL__ */
#endif
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 Jens Axboe <axboe@suse.de> */ #include <linux/compat.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/module.h> #include <linux/blkdev.h> #include <linux/capability.h> #include <linux/completion.h> #include <linux/cdrom.h> #include <linux/ratelimit.h> #include <linux/slab.h> #include <linux/times.h> #include <linux/uio.h> #include <linux/uaccess.h> #include <scsi/scsi.h> #include <scsi/scsi_ioctl.h> #include <scsi/scsi_cmnd.h> 
#include <scsi/sg.h> struct blk_cmd_filter { unsigned long read_ok[BLK_SCSI_CMD_PER_LONG]; unsigned long write_ok[BLK_SCSI_CMD_PER_LONG]; }; static struct blk_cmd_filter blk_default_cmd_filter; /* Command group 3 is reserved and should never be used. */ const unsigned char scsi_command_size_tbl[8] = { 6, 10, 10, 12, 16, 12, 10, 10 }; EXPORT_SYMBOL(scsi_command_size_tbl); static int sg_get_version(int __user *p) { static const int sg_version_num = 30527; return put_user(sg_version_num, p); } static int scsi_get_idlun(struct request_queue *q, int __user *p) { return put_user(0, p); } static int scsi_get_bus(struct request_queue *q, int __user *p) { return put_user(0, p); } static int sg_get_timeout(struct request_queue *q) { return jiffies_to_clock_t(q->sg_timeout); } static int sg_set_timeout(struct request_queue *q, int __user *p) { int timeout, err = get_user(timeout, p); if (!err) q->sg_timeout = clock_t_to_jiffies(timeout); return err; } static int max_sectors_bytes(struct request_queue *q) { unsigned int max_sectors = queue_max_sectors(q); max_sectors = min_t(unsigned int, max_sectors, INT_MAX >> 9); return max_sectors << 9; } static int sg_get_reserved_size(struct request_queue *q, int __user *p) { int val = min_t(int, q->sg_reserved_size, max_sectors_bytes(q)); return put_user(val, p); } static int sg_set_reserved_size(struct request_queue *q, int __user *p) { int size, err = get_user(size, p); if (err) return err; if (size < 0) return -EINVAL; q->sg_reserved_size = min(size, max_sectors_bytes(q)); return 0; } /* * will always return that we are ATAPI even for a real SCSI drive, I'm not * so sure this is worth doing anything about (why would you care??) 
*/ static int sg_emulated_host(struct request_queue *q, int __user *p) { return put_user(1, p); } static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) { /* Basic read-only commands */ __set_bit(TEST_UNIT_READY, filter->read_ok); __set_bit(REQUEST_SENSE, filter->read_ok); __set_bit(READ_6, filter->read_ok); __set_bit(READ_10, filter->read_ok); __set_bit(READ_12, filter->read_ok); __set_bit(READ_16, filter->read_ok); __set_bit(READ_BUFFER, filter->read_ok); __set_bit(READ_DEFECT_DATA, filter->read_ok); __set_bit(READ_CAPACITY, filter->read_ok); __set_bit(READ_LONG, filter->read_ok); __set_bit(INQUIRY, filter->read_ok); __set_bit(MODE_SENSE, filter->read_ok); __set_bit(MODE_SENSE_10, filter->read_ok); __set_bit(LOG_SENSE, filter->read_ok); __set_bit(START_STOP, filter->read_ok); __set_bit(GPCMD_VERIFY_10, filter->read_ok); __set_bit(VERIFY_16, filter->read_ok); __set_bit(REPORT_LUNS, filter->read_ok); __set_bit(SERVICE_ACTION_IN_16, filter->read_ok); __set_bit(RECEIVE_DIAGNOSTIC, filter->read_ok); __set_bit(MAINTENANCE_IN, filter->read_ok); __set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok); /* Audio CD commands */ __set_bit(GPCMD_PLAY_CD, filter->read_ok); __set_bit(GPCMD_PLAY_AUDIO_10, filter->read_ok); __set_bit(GPCMD_PLAY_AUDIO_MSF, filter->read_ok); __set_bit(GPCMD_PLAY_AUDIO_TI, filter->read_ok); __set_bit(GPCMD_PAUSE_RESUME, filter->read_ok); /* CD/DVD data reading */ __set_bit(GPCMD_READ_CD, filter->read_ok); __set_bit(GPCMD_READ_CD_MSF, filter->read_ok); __set_bit(GPCMD_READ_DISC_INFO, filter->read_ok); __set_bit(GPCMD_READ_CDVD_CAPACITY, filter->read_ok); __set_bit(GPCMD_READ_DVD_STRUCTURE, filter->read_ok); __set_bit(GPCMD_READ_HEADER, filter->read_ok); __set_bit(GPCMD_READ_TRACK_RZONE_INFO, filter->read_ok); __set_bit(GPCMD_READ_SUBCHANNEL, filter->read_ok); __set_bit(GPCMD_READ_TOC_PMA_ATIP, filter->read_ok); __set_bit(GPCMD_REPORT_KEY, filter->read_ok); __set_bit(GPCMD_SCAN, filter->read_ok); __set_bit(GPCMD_GET_CONFIGURATION, 
filter->read_ok); __set_bit(GPCMD_READ_FORMAT_CAPACITIES, filter->read_ok); __set_bit(GPCMD_GET_EVENT_STATUS_NOTIFICATION, filter->read_ok); __set_bit(GPCMD_GET_PERFORMANCE, filter->read_ok); __set_bit(GPCMD_SEEK, filter->read_ok); __set_bit(GPCMD_STOP_PLAY_SCAN, filter->read_ok); /* Basic writing commands */ __set_bit(WRITE_6, filter->write_ok); __set_bit(WRITE_10, filter->write_ok); __set_bit(WRITE_VERIFY, filter->write_ok); __set_bit(WRITE_12, filter->write_ok); __set_bit(WRITE_VERIFY_12, filter->write_ok); __set_bit(WRITE_16, filter->write_ok); __set_bit(WRITE_LONG, filter->write_ok); __set_bit(WRITE_LONG_2, filter->write_ok); __set_bit(WRITE_SAME, filter->write_ok); __set_bit(WRITE_SAME_16, filter->write_ok); __set_bit(WRITE_SAME_32, filter->write_ok); __set_bit(ERASE, filter->write_ok); __set_bit(GPCMD_MODE_SELECT_10, filter->write_ok); __set_bit(MODE_SELECT, filter->write_ok); __set_bit(LOG_SELECT, filter->write_ok); __set_bit(GPCMD_BLANK, filter->write_ok); __set_bit(GPCMD_CLOSE_TRACK, filter->write_ok); __set_bit(GPCMD_FLUSH_CACHE, filter->write_ok); __set_bit(GPCMD_FORMAT_UNIT, filter->write_ok); __set_bit(GPCMD_REPAIR_RZONE_TRACK, filter->write_ok); __set_bit(GPCMD_RESERVE_RZONE_TRACK, filter->write_ok); __set_bit(GPCMD_SEND_DVD_STRUCTURE, filter->write_ok); __set_bit(GPCMD_SEND_EVENT, filter->write_ok); __set_bit(GPCMD_SEND_KEY, filter->write_ok); __set_bit(GPCMD_SEND_OPC, filter->write_ok); __set_bit(GPCMD_SEND_CUE_SHEET, filter->write_ok); __set_bit(GPCMD_SET_SPEED, filter->write_ok); __set_bit(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL, filter->write_ok); __set_bit(GPCMD_LOAD_UNLOAD, filter->write_ok); __set_bit(GPCMD_SET_STREAMING, filter->write_ok); __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok); /* ZBC Commands */ __set_bit(ZBC_OUT, filter->write_ok); __set_bit(ZBC_IN, filter->read_ok); } int blk_verify_command(unsigned char *cmd, fmode_t mode) { struct blk_cmd_filter *filter = &blk_default_cmd_filter; /* root can do any command. 
*/ if (capable(CAP_SYS_RAWIO)) return 0; /* Anybody who can open the device can do a read-safe command */ if (test_bit(cmd[0], filter->read_ok)) return 0; /* Write-safe commands require a writable open */ if (test_bit(cmd[0], filter->write_ok) && (mode & FMODE_WRITE)) return 0; return -EPERM; } EXPORT_SYMBOL(blk_verify_command); static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, struct sg_io_hdr *hdr, fmode_t mode) { struct scsi_request *req = scsi_req(rq); if (copy_from_user(req->cmd, hdr->cmdp, hdr->cmd_len)) return -EFAULT; if (blk_verify_command(req->cmd, mode)) return -EPERM; /* * fill in request structure */ req->cmd_len = hdr->cmd_len; rq->timeout = msecs_to_jiffies(hdr->timeout); if (!rq->timeout) rq->timeout = q->sg_timeout; if (!rq->timeout) rq->timeout = BLK_DEFAULT_SG_TIMEOUT; if (rq->timeout < BLK_MIN_SG_TIMEOUT) rq->timeout = BLK_MIN_SG_TIMEOUT; return 0; } static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, struct bio *bio) { struct scsi_request *req = scsi_req(rq); int r, ret = 0; /* * fill in all the output members */ hdr->status = req->result & 0xff; hdr->masked_status = status_byte(req->result); hdr->msg_status = msg_byte(req->result); hdr->host_status = host_byte(req->result); hdr->driver_status = driver_byte(req->result); hdr->info = 0; if (hdr->masked_status || hdr->host_status || hdr->driver_status) hdr->info |= SG_INFO_CHECK; hdr->resid = req->resid_len; hdr->sb_len_wr = 0; if (req->sense_len && hdr->sbp) { int len = min((unsigned int) hdr->mx_sb_len, req->sense_len); if (!copy_to_user(hdr->sbp, req->sense, len)) hdr->sb_len_wr = len; else ret = -EFAULT; } r = blk_rq_unmap_user(bio); if (!ret) ret = r; return ret; } static int sg_io(struct request_queue *q, struct gendisk *bd_disk, struct sg_io_hdr *hdr, fmode_t mode) { unsigned long start_time; ssize_t ret = 0; int writing = 0; int at_head = 0; struct request *rq; struct scsi_request *req; struct bio *bio; if (hdr->interface_id != 'S') return 
-EINVAL; if (hdr->dxfer_len > (queue_max_hw_sectors(q) << 9)) return -EIO; if (hdr->dxfer_len) switch (hdr->dxfer_direction) { default: return -EINVAL; case SG_DXFER_TO_DEV: writing = 1; break; case SG_DXFER_TO_FROM_DEV: case SG_DXFER_FROM_DEV: break; } if (hdr->flags & SG_FLAG_Q_AT_HEAD) at_head = 1; ret = -ENOMEM; rq = blk_get_request(q, writing ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0); if (IS_ERR(rq)) return PTR_ERR(rq); req = scsi_req(rq); if (hdr->cmd_len > BLK_MAX_CDB) { req->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL); if (!req->cmd) goto out_put_request; } ret = blk_fill_sghdr_rq(q, rq, hdr, mode); if (ret < 0) goto out_free_cdb; ret = 0; if (hdr->iovec_count) { struct iov_iter i; struct iovec *iov = NULL; ret = import_iovec(rq_data_dir(rq), hdr->dxferp, hdr->iovec_count, 0, &iov, &i); if (ret < 0) goto out_free_cdb; /* SG_IO howto says that the shorter of the two wins */ iov_iter_truncate(&i, hdr->dxfer_len); ret = blk_rq_map_user_iov(q, rq, NULL, &i, GFP_KERNEL); kfree(iov); } else if (hdr->dxfer_len) ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, GFP_KERNEL); if (ret) goto out_free_cdb; bio = rq->bio; req->retries = 0; start_time = jiffies; /* ignore return value. All information is passed back to caller * (if he doesn't check that is his problem). * N.B. a non-zero SCSI status is _not_ necessarily an error. */ blk_execute_rq(q, bd_disk, rq, at_head); hdr->duration = jiffies_to_msecs(jiffies - start_time); ret = blk_complete_sghdr_rq(rq, hdr, bio); out_free_cdb: scsi_req_free_cmd(req); out_put_request: blk_put_request(rq); return ret; } /** * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl * @q: request queue to send scsi commands down * @disk: gendisk to operate on (option) * @mode: mode used to open the file through which the ioctl has been * submitted * @sic: userspace structure describing the command to perform * * Send down the scsi command described by @sic to the device below * the request queue @q. 
@mode is used to perform * fine-grained permission checks that allow users to send down * non-destructive SCSI commands. If the caller has a struct gendisk * available it should be passed in as @disk to allow the low level * driver to use the information contained in it. A NULL @disk * is only allowed if the caller knows that the low level driver doesn't * need it (e.g. in the scsi subsystem). * * Notes: * - This interface is deprecated - users should use the SG_IO * interface instead, as this is a more flexible approach to * performing SCSI commands on a device. * - The SCSI command length is determined by examining the 1st byte * of the given command. There is no way to override this. * - Data transfers are limited to PAGE_SIZE * - The length (x + y) must be at least OMAX_SB_LEN bytes long to * accommodate the sense buffer when an error occurs. * The sense buffer is truncated to OMAX_SB_LEN (16) bytes so that * old code will not be surprised. * - If a Unix error occurs (e.g. ENOMEM) then the user will receive * a negative return and the Unix error code in 'errno'. * If the SCSI command succeeds then 0 is returned. * Positive numbers returned are the compacted SCSI error codes (4 * bytes in one int) where the lowest byte is the SCSI status.
 */
int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
		struct scsi_ioctl_command __user *sic)
{
	enum { OMAX_SB_LEN = 16 };	/* For backward compatibility */
	struct request *rq;
	struct scsi_request *req;
	int err;
	unsigned int in_len, out_len, bytes, opcode, cmdlen;
	char *buffer = NULL;

	if (!sic)
		return -EINVAL;

	/*
	 * get in and out lengths, verify they don't exceed a page worth of data
	 */
	if (get_user(in_len, &sic->inlen))
		return -EFAULT;
	if (get_user(out_len, &sic->outlen))
		return -EFAULT;
	if (in_len > PAGE_SIZE || out_len > PAGE_SIZE)
		return -EINVAL;
	/*
	 * Fetch the opcode from the start of the user's data area; it selects
	 * the command length and the timeout/retry policy below.
	 */
	if (get_user(opcode, sic->data))
		return -EFAULT;

	/* A single kernel buffer serves both directions, sized for the larger. */
	bytes = max(in_len, out_len);
	if (bytes) {
		buffer = kzalloc(bytes, q->bounce_gfp | GFP_USER| __GFP_NOWARN);
		if (!buffer)
			return -ENOMEM;

	}

	/* Any inbound payload makes this a "to device" request. */
	rq = blk_get_request(q, in_len ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto error_free_buffer;
	}
	req = scsi_req(rq);

	cmdlen = COMMAND_SIZE(opcode);

	/*
	 * get command and data to send to device, if any
	 */
	err = -EFAULT;
	req->cmd_len = cmdlen;
	if (copy_from_user(req->cmd, sic->data, cmdlen))
		goto error;

	/* The outbound payload follows the command bytes in sic->data. */
	if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len))
		goto error;

	err = blk_verify_command(req->cmd, mode);
	if (err)
		goto error;

	/* default; possibly overridden per opcode below */
	req->retries = 5;

	switch (opcode) {
	case SEND_DIAGNOSTIC:
	case FORMAT_UNIT:
		rq->timeout = FORMAT_UNIT_TIMEOUT;
		req->retries = 1;
		break;
	case START_STOP:
		rq->timeout = START_STOP_TIMEOUT;
		break;
	case MOVE_MEDIUM:
		rq->timeout = MOVE_MEDIUM_TIMEOUT;
		break;
	case READ_ELEMENT_STATUS:
		rq->timeout = READ_ELEMENT_STATUS_TIMEOUT;
		break;
	case READ_DEFECT_DATA:
		rq->timeout = READ_DEFECT_DATA_TIMEOUT;
		req->retries = 1;
		break;
	default:
		rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
		break;
	}

	/*
	 * A mapping failure is reported as a positive driver-error code, not
	 * as a negative errno.
	 */
	if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, GFP_NOIO)) {
		err = DRIVER_ERROR << 24;
		goto error;
	}

	blk_execute_rq(q, disk, rq, 0);

	err = req->result & 0xff;	/* only 8 bit SCSI status */
	if (err) {
		/* On failure the (truncated) sense data replaces the payload. */
		if (req->sense_len && req->sense) {
			bytes = (OMAX_SB_LEN > req->sense_len) ?
				req->sense_len : OMAX_SB_LEN;
			if (copy_to_user(sic->data, req->sense, bytes))
				err = -EFAULT;
		}
	} else {
		if (copy_to_user(sic->data, buffer, out_len))
			err = -EFAULT;
	}

error:
	blk_put_request(rq);

error_free_buffer:
	kfree(buffer);

	return err;
}
EXPORT_SYMBOL_GPL(sg_scsi_ioctl);

/*
 * Send basic block requests.
 *
 * Builds a 6-byte command with @cmd in byte 0 and @data in byte 4, executes
 * it with the default SG timeout and returns -EIO if the request result is
 * non-zero, 0 otherwise (or the blk_get_request() error).
 */
static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
			      int cmd, int data)
{
	struct request *rq;
	int err;

	rq = blk_get_request(q, REQ_OP_SCSI_OUT, 0);
	if (IS_ERR(rq))
		return PTR_ERR(rq);
	rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
	scsi_req(rq)->cmd[0] = cmd;
	scsi_req(rq)->cmd[4] = data;
	scsi_req(rq)->cmd_len = 6;
	blk_execute_rq(q, bd_disk, rq, 0);
	err = scsi_req(rq)->result ? -EIO : 0;
	blk_put_request(rq);

	return err;
}

/* Issue GPCMD_START_STOP_UNIT with @data as byte 4 of the command. */
static inline int blk_send_start_stop(struct request_queue *q,
				      struct gendisk *bd_disk, int data)
{
	return __blk_send_generic(q, bd_disk, GPCMD_START_STOP_UNIT, data);
}

/*
 * Copy @hdr out to user space at @argp.  When called from a compat syscall
 * the header is first converted field by field to struct compat_sg_io_hdr.
 * Returns 0 on success or -EFAULT if the copy fails.
 */
int put_sg_io_hdr(const struct sg_io_hdr *hdr, void __user *argp)
{
#ifdef CONFIG_COMPAT
	if (in_compat_syscall()) {
		struct compat_sg_io_hdr hdr32 =  {
			.interface_id	 = hdr->interface_id,
			.dxfer_direction = hdr->dxfer_direction,
			.cmd_len	 = hdr->cmd_len,
			.mx_sb_len	 = hdr->mx_sb_len,
			.iovec_count	 = hdr->iovec_count,
			.dxfer_len	 = hdr->dxfer_len,
			.dxferp		 = (uintptr_t)hdr->dxferp,
			.cmdp		 = (uintptr_t)hdr->cmdp,
			.sbp		 = (uintptr_t)hdr->sbp,
			.timeout	 = hdr->timeout,
			.flags		 = hdr->flags,
			.pack_id	 = hdr->pack_id,
			.usr_ptr	 = (uintptr_t)hdr->usr_ptr,
			.status		 = hdr->status,
			.masked_status	 = hdr->masked_status,
			.msg_status	 = hdr->msg_status,
			.sb_len_wr	 = hdr->sb_len_wr,
			.host_status	 = hdr->host_status,
			.driver_status	 = hdr->driver_status,
			.resid		 = hdr->resid,
			.duration	 = hdr->duration,
			.info		 = hdr->info,
		};

		if (copy_to_user(argp, &hdr32, sizeof(hdr32)))
			return -EFAULT;

		return 0;
	}
#endif

	if (copy_to_user(argp, hdr, sizeof(*hdr)))
		return -EFAULT;

	return 0;
}
EXPORT_SYMBOL(put_sg_io_hdr);

/*
 * Read an sg_io_hdr from user space at @argp into @hdr.  When called from a
 * compat syscall a struct compat_sg_io_hdr is read instead and widened, with
 * the pointer members converted through compat_ptr().
 * Returns 0 on success or -EFAULT if the copy fails.
 */
int get_sg_io_hdr(struct sg_io_hdr *hdr, const void __user *argp)
{
#ifdef CONFIG_COMPAT
	struct compat_sg_io_hdr hdr32;

	if (in_compat_syscall()) {
		if (copy_from_user(&hdr32, argp, sizeof(hdr32)))
			return -EFAULT;

		*hdr = (struct sg_io_hdr) {
			.interface_id	 = hdr32.interface_id,
			.dxfer_direction = hdr32.dxfer_direction,
			.cmd_len	 = hdr32.cmd_len,
			.mx_sb_len	 = hdr32.mx_sb_len,
			.iovec_count	 = hdr32.iovec_count,
			.dxfer_len	 = hdr32.dxfer_len,
			.dxferp		 = compat_ptr(hdr32.dxferp),
			.cmdp		 = compat_ptr(hdr32.cmdp),
			.sbp		 = compat_ptr(hdr32.sbp),
			.timeout	 = hdr32.timeout,
			.flags		 = hdr32.flags,
			.pack_id	 = hdr32.pack_id,
			.usr_ptr	 = compat_ptr(hdr32.usr_ptr),
			.status		 = hdr32.status,
			.masked_status	 = hdr32.masked_status,
			.msg_status	 = hdr32.msg_status,
			.sb_len_wr	 = hdr32.sb_len_wr,
			.host_status	 = hdr32.host_status,
			.driver_status	 = hdr32.driver_status,
			.resid		 = hdr32.resid,
			.duration	 = hdr32.duration,
			.info		 = hdr32.info,
		};

		return 0;
	}
#endif

	if (copy_from_user(hdr, argp, sizeof(*hdr)))
		return -EFAULT;

	return 0;
}
EXPORT_SYMBOL(get_sg_io_hdr);

#ifdef CONFIG_COMPAT
/* Compat-syscall layout of struct cdrom_generic_command. */
struct compat_cdrom_generic_command {
	unsigned char	cmd[CDROM_PACKET_SIZE];
	compat_caddr_t	buffer;
	compat_uint_t	buflen;
	compat_int_t	stat;
	compat_caddr_t	sense;
	unsigned char	data_direction;
	unsigned char	pad[3];
	compat_int_t	quiet;
	compat_int_t	timeout;
	compat_caddr_t	unused;
};
#endif

/*
 * Read a cdrom_generic_command from user space at @arg into @cgc, converting
 * from the compat layout when called from a compat syscall.
 * Returns 0 on success or -EFAULT if the copy fails.
 */
static int scsi_get_cdrom_generic_arg(struct cdrom_generic_command *cgc,
				      const void __user *arg)
{
#ifdef CONFIG_COMPAT
	if (in_compat_syscall()) {
		struct compat_cdrom_generic_command cgc32;

		if (copy_from_user(&cgc32, arg, sizeof(cgc32)))
			return -EFAULT;

		*cgc = (struct cdrom_generic_command) {
			.buffer		= compat_ptr(cgc32.buffer),
			.buflen		= cgc32.buflen,
			.stat		= cgc32.stat,
			.sense		= compat_ptr(cgc32.sense),
			.data_direction	= cgc32.data_direction,
			.quiet		= cgc32.quiet,
			.timeout	= cgc32.timeout,
			.unused		= compat_ptr(cgc32.unused),
		};
		/* cmd is an array, so it is copied separately from the initializer. */
		memcpy(&cgc->cmd, &cgc32.cmd, CDROM_PACKET_SIZE);
		return 0;
	}
#endif
	if (copy_from_user(cgc, arg, sizeof(*cgc)))
		return -EFAULT;

	return 0;
}

/*
 * Write @cgc back to user space at @arg, converting to the compat layout
 * when called from a compat syscall.
 * Returns 0 on success or -EFAULT if the copy fails.
 */
static int scsi_put_cdrom_generic_arg(const struct cdrom_generic_command *cgc,
				      void __user *arg)
{
#ifdef CONFIG_COMPAT
	if (in_compat_syscall()) {
		struct compat_cdrom_generic_command cgc32 = {
			.buffer		= (uintptr_t)(cgc->buffer),
			.buflen		= cgc->buflen,
			.stat		= cgc->stat,
			.sense		= (uintptr_t)(cgc->sense),
			.data_direction	= cgc->data_direction,
			.quiet		= cgc->quiet,
			.timeout	= cgc->timeout,
			.unused		= (uintptr_t)(cgc->unused),
		};
		memcpy(&cgc32.cmd, &cgc->cmd, CDROM_PACKET_SIZE);

		if (copy_to_user(arg, &cgc32, sizeof(cgc32)))
			return -EFAULT;

		return 0;
	}
#endif
	if (copy_to_user(arg, cgc, sizeof(*cgc)))
		return -EFAULT;

	return 0;
}

/*
 * Handle CDROM_SEND_PACKET: translate the user's cdrom_generic_command into
 * an sg_io_hdr, run it through sg_io() and write the status and residual
 * count back to the user structure.
 *
 * Returns -EINVAL for an unknown data_direction, -EFAULT if a user copy
 * fails (or sg_io() reports it), -EIO if the command completed with a
 * non-zero hdr.status, otherwise the value returned by sg_io().
 */
static int scsi_cdrom_send_packet(struct request_queue *q,
				  struct gendisk *bd_disk,
				  fmode_t mode, void __user *arg)
{
	struct cdrom_generic_command cgc;
	struct sg_io_hdr hdr;
	int err;

	err = scsi_get_cdrom_generic_arg(&cgc, arg);
	if (err)
		return err;

	cgc.timeout = clock_t_to_jiffies(cgc.timeout);
	memset(&hdr, 0, sizeof(hdr));
	hdr.interface_id = 'S';
	hdr.cmd_len = sizeof(cgc.cmd);
	hdr.dxfer_len = cgc.buflen;
	switch (cgc.data_direction) {
		case CGC_DATA_UNKNOWN:
			hdr.dxfer_direction = SG_DXFER_UNKNOWN;
			break;
		case CGC_DATA_WRITE:
			hdr.dxfer_direction = SG_DXFER_TO_DEV;
			break;
		case CGC_DATA_READ:
			hdr.dxfer_direction = SG_DXFER_FROM_DEV;
			break;
		case CGC_DATA_NONE:
			hdr.dxfer_direction = SG_DXFER_NONE;
			break;
		default:
			return -EINVAL;
	}

	hdr.dxferp = cgc.buffer;
	hdr.sbp = cgc.sense;
	if (hdr.sbp)
		hdr.mx_sb_len = sizeof(struct request_sense);
	hdr.timeout = jiffies_to_msecs(cgc.timeout);
	/* The command bytes are read by sg_io() straight from the user struct. */
	hdr.cmdp = ((struct cdrom_generic_command __user*) arg)->cmd;
	/* NOTE(review): cmd_len was already set above; this store is redundant. */
	hdr.cmd_len = sizeof(cgc.cmd);

	err = sg_io(q, bd_disk, &hdr, mode);
	if (err == -EFAULT)
		return -EFAULT;

	if (hdr.status)
		return -EIO;

	cgc.stat = err;
	cgc.buflen = hdr.resid;
	if (scsi_put_cdrom_generic_arg(&cgc, arg))
		return -EFAULT;

	return err;
}

/*
 * Dispatch the SCSI/SG/CDROM ioctls handled at the block layer for queue @q.
 * Returns -ENXIO if @q is NULL, -ENOTTY for a command not handled here,
 * otherwise the result of the selected handler.
 */
int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mode,
		   unsigned int cmd, void __user *arg)
{
	int err;

	if (!q)
		return -ENXIO;

	switch (cmd) {
		/*
		 * new sgv3 interface
		 */
		case SG_GET_VERSION_NUM:
			err = sg_get_version(arg);
			break;
		case SCSI_IOCTL_GET_IDLUN:
			err = scsi_get_idlun(q, arg);
			break;
		case SCSI_IOCTL_GET_BUS_NUMBER:
			err = scsi_get_bus(q, arg);
			break;
		case SG_SET_TIMEOUT:
			err = sg_set_timeout(q, arg);
			break;
		case SG_GET_TIMEOUT:
			err = sg_get_timeout(q);
			break;
		case SG_GET_RESERVED_SIZE:
			err = sg_get_reserved_size(q, arg);
			break;
		case SG_SET_RESERVED_SIZE:
			err = sg_set_reserved_size(q, arg);
			break;
		case SG_EMULATED_HOST:
			err = sg_emulated_host(q, arg);
			break;
		case SG_IO: {
			struct sg_io_hdr hdr;

			err = get_sg_io_hdr(&hdr, arg);
			if (err)
				break;
			err = sg_io(q, bd_disk, &hdr, mode);
			/* On -EFAULT the header is not written back. */
			if (err == -EFAULT)
				break;

			if (put_sg_io_hdr(&hdr, arg))
				err = -EFAULT;
			break;
		}
		case CDROM_SEND_PACKET:
			err = scsi_cdrom_send_packet(q, bd_disk, mode, arg);
			break;

		/*
		 * old junk scsi send command ioctl
		 */
		case SCSI_IOCTL_SEND_COMMAND:
			printk(KERN_WARNING "program %s is using a deprecated SCSI ioctl, please convert it to SG_IO\n", current->comm);
			err = -EINVAL;
			if (!arg)
				break;

			err = sg_scsi_ioctl(q, bd_disk, mode, arg);
			break;
		case CDROMCLOSETRAY:
			err = blk_send_start_stop(q, bd_disk, 0x03);
			break;
		case CDROMEJECT:
			err = blk_send_start_stop(q, bd_disk, 0x02);
			break;
		default:
			err = -ENOTTY;
	}

	return err;
}
EXPORT_SYMBOL(scsi_cmd_ioctl);

/*
 * Decide whether a SCSI ioctl may be issued on @bd.  Returns 0 when @bd is
 * non-NULL and not a partition, or when the caller has CAP_SYS_RAWIO;
 * otherwise -ENOIOCTLCMD.  @cmd is not examined.
 */
int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd)
{
	if (bd && !bdev_is_partition(bd))
		return 0;

	if (capable(CAP_SYS_RAWIO))
		return 0;

	return -ENOIOCTLCMD;
}
EXPORT_SYMBOL(scsi_verify_blk_ioctl);

/*
 * Block-device entry point: check the ioctl with scsi_verify_blk_ioctl() and,
 * if permitted, forward it to scsi_cmd_ioctl() on the disk's queue.
 */
int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode,
		       unsigned int cmd, void __user *arg)
{
	int ret;

	ret = scsi_verify_blk_ioctl(bd, cmd);
	if (ret < 0)
		return ret;

	return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg);
}
EXPORT_SYMBOL(scsi_cmd_blk_ioctl);

/**
 * scsi_req_init - initialize certain fields of a scsi_request structure
 * @req: Pointer to a scsi_request structure.
 * Initializes .__cmd[], .cmd, .cmd_len and .sense_len but no other members
 * of struct scsi_request.
 */
void scsi_req_init(struct scsi_request *req)
{
	memset(req->__cmd, 0, sizeof(req->__cmd));
	req->cmd = req->__cmd;
	req->cmd_len = BLK_MAX_CDB;
	req->sense_len = 0;
}
EXPORT_SYMBOL(scsi_req_init);

/* Install the default command filter; registered as an fs_initcall. */
static int __init blk_scsi_ioctl_init(void)
{
	blk_set_cmd_filter_defaults(&blk_default_cmd_filter);
	return 0;
}
fs_initcall(blk_scsi_ioctl_init);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_KERNEL_STAT_H #define _LINUX_KERNEL_STAT_H #include <linux/smp.h> #include <linux/threads.h> #include <linux/percpu.h> #include <linux/cpumask.h> #include <linux/interrupt.h> #include <linux/sched.h> #include <linux/vtime.h> #include <asm/irq.h> /* * 'kernel_stat.h' contains the definitions needed for doing * some kernel statistics (CPU usage, context switches ...), * used by rstatd/perfmeter */ enum cpu_usage_stat { CPUTIME_USER, CPUTIME_NICE, CPUTIME_SYSTEM, CPUTIME_SOFTIRQ, CPUTIME_IRQ, CPUTIME_IDLE, CPUTIME_IOWAIT, CPUTIME_STEAL, CPUTIME_GUEST, CPUTIME_GUEST_NICE, NR_STATS, }; struct kernel_cpustat { u64 cpustat[NR_STATS]; }; struct kernel_stat { unsigned long irqs_sum; unsigned int softirqs[NR_SOFTIRQS]; }; DECLARE_PER_CPU(struct kernel_stat, kstat); DECLARE_PER_CPU(struct kernel_cpustat, kernel_cpustat); /* Must have preemption disabled for this to be meaningful. 
*/ #define kstat_this_cpu this_cpu_ptr(&kstat) #define kcpustat_this_cpu this_cpu_ptr(&kernel_cpustat) #define kstat_cpu(cpu) per_cpu(kstat, cpu) #define kcpustat_cpu(cpu) per_cpu(kernel_cpustat, cpu) extern unsigned long long nr_context_switches(void); extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); extern void kstat_incr_irq_this_cpu(unsigned int irq); static inline void kstat_incr_softirqs_this_cpu(unsigned int irq) { __this_cpu_inc(kstat.softirqs[irq]); } static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu) { return kstat_cpu(cpu).softirqs[irq]; } /* * Number of interrupts per specific IRQ source, since bootup */ extern unsigned int kstat_irqs(unsigned int irq); extern unsigned int kstat_irqs_usr(unsigned int irq); /* * Number of interrupts per cpu, since bootup */ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu) { return kstat_cpu(cpu).irqs_sum; } #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern u64 kcpustat_field(struct kernel_cpustat *kcpustat, enum cpu_usage_stat usage, int cpu); extern void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu); #else static inline u64 kcpustat_field(struct kernel_cpustat *kcpustat, enum cpu_usage_stat usage, int cpu) { return kcpustat->cpustat[usage]; } static inline void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu) { *dst = kcpustat_cpu(cpu); } #endif extern void account_user_time(struct task_struct *, u64); extern void account_guest_time(struct task_struct *, u64); extern void account_system_time(struct task_struct *, int, u64); extern void account_system_index_time(struct task_struct *, u64, enum cpu_usage_stat); extern void account_steal_time(u64); extern void account_idle_time(u64); extern u64 get_idle_time(struct kernel_cpustat *kcs, int cpu); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE static inline void account_process_tick(struct task_struct *tsk, int user) { vtime_flush(tsk); } #else extern void account_process_tick(struct task_struct *, int user); #endif 
extern void account_idle_ticks(unsigned long ticks); #endif /* _LINUX_KERNEL_STAT_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM msr #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE msr-trace #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH asm/ #if !defined(_TRACE_MSR_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MSR_H #include <linux/tracepoint.h> /* * Tracing for x86 model specific registers. Directly maps to the * RDMSR/WRMSR instructions. */ DECLARE_EVENT_CLASS(msr_trace_class, TP_PROTO(unsigned msr, u64 val, int failed), TP_ARGS(msr, val, failed), TP_STRUCT__entry( __field( unsigned, msr ) __field( u64, val ) __field( int, failed ) ), TP_fast_assign( __entry->msr = msr; __entry->val = val; __entry->failed = failed; ), TP_printk("%x, value %llx%s", __entry->msr, __entry->val, __entry->failed ? " #GP" : "") ); DEFINE_EVENT(msr_trace_class, read_msr, TP_PROTO(unsigned msr, u64 val, int failed), TP_ARGS(msr, val, failed) ); DEFINE_EVENT(msr_trace_class, write_msr, TP_PROTO(unsigned msr, u64 val, int failed), TP_ARGS(msr, val, failed) ); DEFINE_EVENT(msr_trace_class, rdpmc, TP_PROTO(unsigned msr, u64 val, int failed), TP_ARGS(msr, val, failed) ); #endif /* _TRACE_MSR_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/file_table.c * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 1997 David S. 
Miller (davem@caip.rutgers.edu) */ #include <linux/string.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/init.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/eventpoll.h> #include <linux/rcupdate.h> #include <linux/mount.h> #include <linux/capability.h> #include <linux/cdev.h> #include <linux/fsnotify.h> #include <linux/sysctl.h> #include <linux/percpu_counter.h> #include <linux/percpu.h> #include <linux/task_work.h> #include <linux/ima.h> #include <linux/swap.h> #include <linux/atomic.h> #include "internal.h" /* sysctl tunables... */ struct files_stat_struct files_stat = { .max_files = NR_FILE }; /* SLAB cache for file structures */ static struct kmem_cache *filp_cachep __read_mostly; static struct percpu_counter nr_files __cacheline_aligned_in_smp; static void file_free_rcu(struct rcu_head *head) { struct file *f = container_of(head, struct file, f_u.fu_rcuhead); put_cred(f->f_cred); kmem_cache_free(filp_cachep, f); } static inline void file_free(struct file *f) { security_file_free(f); if (!(f->f_mode & FMODE_NOACCOUNT)) percpu_counter_dec(&nr_files); call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } /* * Return the total number of open files in the system */ static long get_nr_files(void) { return percpu_counter_read_positive(&nr_files); } /* * Return the maximum number of open files in the system */ unsigned long get_max_files(void) { return files_stat.max_files; } EXPORT_SYMBOL_GPL(get_max_files); /* * Handle nr_files sysctl */ #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) int proc_nr_files(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #else int proc_nr_files(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } #endif static struct file *__alloc_file(int 
flags, const struct cred *cred) { struct file *f; int error; f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); if (unlikely(!f)) return ERR_PTR(-ENOMEM); f->f_cred = get_cred(cred); error = security_file_alloc(f); if (unlikely(error)) { file_free_rcu(&f->f_u.fu_rcuhead); return ERR_PTR(error); } atomic_long_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); spin_lock_init(&f->f_lock); mutex_init(&f->f_pos_lock); eventpoll_init_file(f); f->f_flags = flags; f->f_mode = OPEN_FMODE(flags); /* f->f_version: 0 */ return f; } /* Find an unused file structure and return a pointer to it. * Returns an error pointer if some error happend e.g. we over file * structures limit, run out of memory or operation is not permitted. * * Be very careful using this. You are responsible for * getting write access to any mount that you might assign * to this filp, if it is opened for write. If this is not * done, you will imbalance int the mount's writer count * and a warning at __fput() time. */ struct file *alloc_empty_file(int flags, const struct cred *cred) { static long old_max; struct file *f; /* * Privileged users can go above max_files */ if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { /* * percpu_counters are inaccurate. Do an expensive check before * we go and fail. */ if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files) goto over; } f = __alloc_file(flags, cred); if (!IS_ERR(f)) percpu_counter_inc(&nr_files); return f; over: /* Ran out of filps - report that */ if (get_nr_files() > old_max) { pr_info("VFS: file-max limit %lu reached\n", get_max_files()); old_max = get_nr_files(); } return ERR_PTR(-ENFILE); } /* * Variant of alloc_empty_file() that doesn't check and modify nr_files. * * Should not be used unless there's a very good reason to do so. 
*/ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred) { struct file *f = __alloc_file(flags, cred); if (!IS_ERR(f)) f->f_mode |= FMODE_NOACCOUNT; return f; } /** * alloc_file - allocate and initialize a 'struct file' * * @path: the (dentry, vfsmount) pair for the new file * @flags: O_... flags with which the new file will be opened * @fop: the 'struct file_operations' for the new file */ static struct file *alloc_file(const struct path *path, int flags, const struct file_operations *fop) { struct file *file; file = alloc_empty_file(flags, current_cred()); if (IS_ERR(file)) return file; file->f_path = *path; file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); file->f_sb_err = file_sample_sb_err(file); if ((file->f_mode & FMODE_READ) && likely(fop->read || fop->read_iter)) file->f_mode |= FMODE_CAN_READ; if ((file->f_mode & FMODE_WRITE) && likely(fop->write || fop->write_iter)) file->f_mode |= FMODE_CAN_WRITE; file->f_mode |= FMODE_OPENED; file->f_op = fop; if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(path->dentry->d_inode); return file; } struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, const char *name, int flags, const struct file_operations *fops) { static const struct dentry_operations anon_ops = { .d_dname = simple_dname }; struct qstr this = QSTR_INIT(name, strlen(name)); struct path path; struct file *file; path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this); if (!path.dentry) return ERR_PTR(-ENOMEM); if (!mnt->mnt_sb->s_d_op) d_set_d_op(path.dentry, &anon_ops); path.mnt = mntget(mnt); d_instantiate(path.dentry, inode); file = alloc_file(&path, flags, fops); if (IS_ERR(file)) { ihold(inode); path_put(&path); } return file; } EXPORT_SYMBOL(alloc_file_pseudo); struct file *alloc_file_clone(struct file *base, int flags, const struct file_operations *fops) { struct file *f = 
alloc_file(&base->f_path, flags, fops); if (!IS_ERR(f)) { path_get(&f->f_path); f->f_mapping = base->f_mapping; } return f; } /* the real guts of fput() - releasing the last reference to file */ static void __fput(struct file *file) { struct dentry *dentry = file->f_path.dentry; struct vfsmount *mnt = file->f_path.mnt; struct inode *inode = file->f_inode; fmode_t mode = file->f_mode; if (unlikely(!(file->f_mode & FMODE_OPENED))) goto out; might_sleep(); fsnotify_close(file); /* * The function eventpoll_release() should be the first called * in the file cleanup chain. */ eventpoll_release(file); locks_remove_file(file); ima_file_free(file); if (unlikely(file->f_flags & FASYNC)) { if (file->f_op->fasync) file->f_op->fasync(-1, file, 0); } if (file->f_op->release) file->f_op->release(inode, file); if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && !(mode & FMODE_PATH))) { cdev_put(inode->i_cdev); } fops_put(file->f_op); put_pid(file->f_owner.pid); if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_dec(inode); if (mode & FMODE_WRITER) { put_write_access(inode); __mnt_drop_write(mnt); } dput(dentry); if (unlikely(mode & FMODE_NEED_UNMOUNT)) dissolve_on_fput(mnt); mntput(mnt); out: file_free(file); } static LLIST_HEAD(delayed_fput_list); static void delayed_fput(struct work_struct *unused) { struct llist_node *node = llist_del_all(&delayed_fput_list); struct file *f, *t; llist_for_each_entry_safe(f, t, node, f_u.fu_llist) __fput(f); } static void ____fput(struct callback_head *work) { __fput(container_of(work, struct file, f_u.fu_rcuhead)); } /* * If kernel thread really needs to have the final fput() it has done * to complete, call this. The only user right now is the boot - we * *do* need to make sure our writes to binaries on initramfs has * not left us with opened struct file waiting for __fput() - execve() * won't work without that. 
Please, don't add more callers without * very good reasons; in particular, never call that with locks * held and never call that from a thread that might need to do * some work on any kind of umount. */ void flush_delayed_fput(void) { delayed_fput(NULL); } EXPORT_SYMBOL_GPL(flush_delayed_fput); static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); void fput_many(struct file *file, unsigned int refs) { if (atomic_long_sub_and_test(refs, &file->f_count)) { struct task_struct *task = current; if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { init_task_work(&file->f_u.fu_rcuhead, ____fput); if (!task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME)) return; /* * After this task has run exit_task_work(), * task_work_add() will fail. Fall through to delayed * fput to avoid leaking *file. */ } if (llist_add(&file->f_u.fu_llist, &delayed_fput_list)) schedule_delayed_work(&delayed_fput_work, 1); } } void fput(struct file *file) { fput_many(file, 1); } /* * synchronous analog of fput(); for kernel threads that might be needed * in some umount() (and thus can't use flush_delayed_fput() without * risking deadlocks), need to wait for completion of __fput() and know * for this specific struct file it won't involve anything that would * need them. Use only if you really need it - at the very least, * don't blindly convert fput() by kernel thread to that. */ void __fput_sync(struct file *file) { if (atomic_long_dec_and_test(&file->f_count)) { struct task_struct *task = current; BUG_ON(!(task->flags & PF_KTHREAD)); __fput(file); } } EXPORT_SYMBOL(fput); void __init files_init(void) { filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } /* * One file with associated inode and dcache is very roughly 1K. Per default * do not use more than 10% of our memory for files. 
*/ void __init files_maxfiles_init(void) { unsigned long n; unsigned long nr_pages = totalram_pages(); unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2; memreserve = min(memreserve, nr_pages - 1); n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); }
1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 
524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* include/asm-generic/tlb.h * * Generic TLB shootdown code * * Copyright 2001 Red Hat, Inc. * Based on code from mm/memory.c Copyright Linus Torvalds and others. * * Copyright 2011 Red Hat, Inc., Peter Zijlstra */ #ifndef _ASM_GENERIC__TLB_H #define _ASM_GENERIC__TLB_H #include <linux/mmu_notifier.h> #include <linux/swap.h> #include <linux/hugetlb_inline.h> #include <asm/tlbflush.h> #include <asm/cacheflush.h> /* * Blindly accessing user memory from NMI context can be dangerous * if we're in the middle of switching the current user task or switching * the loaded mm. */ #ifndef nmi_uaccess_okay # define nmi_uaccess_okay() true #endif #ifdef CONFIG_MMU /* * Generic MMU-gather implementation. * * The mmu_gather data structure is used by the mm code to implement the * correct and efficient ordering of freeing pages and TLB invalidations. * * This correct ordering is: * * 1) unhook page * 2) TLB invalidate page * 3) free page * * That is, we must never free a page before we have ensured there are no live * translations left to it. Otherwise it might be possible to observe (or * worse, change) the page content after it has been reused. * * The mmu_gather API consists of: * * - tlb_gather_mmu() / tlb_finish_mmu(); start and finish a mmu_gather * * Finish in particular will issue a (final) TLB invalidate and free * all (remaining) queued pages. 
 *
 *  - tlb_start_vma() / tlb_end_vma(); marks the start / end of a VMA
 *
 *    Defaults to flushing at tlb_end_vma() to reset the range; helps when
 *    there are large holes between the VMAs.
 *
 *  - tlb_remove_table()
 *
 *    tlb_remove_table() is the basic primitive to free page-table directories
 *    (__p*_free_tlb()).  In its most primitive form it is an alias for
 *    tlb_remove_page() below, for when page directories are pages and have no
 *    additional constraints.
 *
 *    See also MMU_GATHER_TABLE_FREE and MMU_GATHER_RCU_TABLE_FREE.
 *
 *  - tlb_remove_page() / __tlb_remove_page()
 *  - tlb_remove_page_size() / __tlb_remove_page_size()
 *
 *    __tlb_remove_page_size() is the basic primitive that queues a page for
 *    freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a
 *    boolean indicating if the queue is (now) full and a call to
 *    tlb_flush_mmu() is required.
 *
 *    tlb_remove_page() and tlb_remove_page_size() imply the call to
 *    tlb_flush_mmu() when required and have no return value.
 *
 *  - tlb_change_page_size()
 *
 *    call before __tlb_remove_page*() to set the current page-size; implies a
 *    possible tlb_flush_mmu() call.
 *
 *  - tlb_flush_mmu() / tlb_flush_mmu_tlbonly()
 *
 *    tlb_flush_mmu_tlbonly() - does the TLB invalidate (and resets
 *                              related state, like the range)
 *
 *    tlb_flush_mmu() - in addition to the above TLB invalidate, also frees
 *                      whatever pages are still batched.
 *
 *  - mmu_gather::fullmm
 *
 *    A flag set by tlb_gather_mmu() to indicate we're going to free
 *    the entire mm; this allows a number of optimizations.
 *
 *    - We can ignore tlb_{start,end}_vma(); because we don't
 *      care about ranges. Everything will be shot down.
 *
 *    - (RISC) architectures that use ASIDs can cycle to a new ASID
 *      and delay the invalidation until ASID space runs out.
 *
 *  - mmu_gather::need_flush_all
 *
 *    A flag that can be set by the arch code if it wants to force
 *    flush the entire TLB irrespective of the range. For instance
 *    x86-PAE needs this when changing top-level entries.
* * And allows the architecture to provide and implement tlb_flush(): * * tlb_flush() may, in addition to the above mentioned mmu_gather fields, make * use of: * * - mmu_gather::start / mmu_gather::end * * which provides the range that needs to be flushed to cover the pages to * be freed. * * - mmu_gather::freed_tables * * set when we freed page table pages * * - tlb_get_unmap_shift() / tlb_get_unmap_size() * * returns the smallest TLB entry size unmapped in this range. * * If an architecture does not provide tlb_flush() a default implementation * based on flush_tlb_range() will be used, unless MMU_GATHER_NO_RANGE is * specified, in which case we'll default to flush_tlb_mm(). * * Additionally there are a few opt-in features: * * MMU_GATHER_PAGE_SIZE * * This ensures we call tlb_flush() every time tlb_change_page_size() actually * changes the size and provides mmu_gather::page_size to tlb_flush(). * * This might be useful if your architecture has size specific TLB * invalidation instructions. * * MMU_GATHER_TABLE_FREE * * This provides tlb_remove_table(), to be used instead of tlb_remove_page() * for page directores (__p*_free_tlb()). * * Useful if your architecture has non-page page directories. * * When used, an architecture is expected to provide __tlb_remove_table() * which does the actual freeing of these pages. * * MMU_GATHER_RCU_TABLE_FREE * * Like MMU_GATHER_TABLE_FREE, and adds semi-RCU semantics to the free (see * comment below). * * Useful if your architecture doesn't use IPIs for remote TLB invalidates * and therefore doesn't naturally serialize with software page-table walkers. * * MMU_GATHER_NO_RANGE * * Use this if your architecture lacks an efficient flush_tlb_range(). * * MMU_GATHER_NO_GATHER * * If the option is set the mmu_gather will not track individual pages for * delayed page free anymore. A platform that enables the option needs to * provide its own implementation of the __tlb_remove_page_size() function to * free pages. 
* * This is useful if your architecture already flushes TLB entries in the * various ptep_get_and_clear() functions. */ #ifdef CONFIG_MMU_GATHER_TABLE_FREE struct mmu_table_batch { #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE struct rcu_head rcu; #endif unsigned int nr; void *tables[0]; }; #define MAX_TABLE_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *)) extern void tlb_remove_table(struct mmu_gather *tlb, void *table); #else /* !CONFIG_MMU_GATHER_HAVE_TABLE_FREE */ /* * Without MMU_GATHER_TABLE_FREE the architecture is assumed to have page based * page directories and we can use the normal page batching to free them. */ #define tlb_remove_table(tlb, page) tlb_remove_page((tlb), (page)) #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE /* * This allows an architecture that does not use the linux page-tables for * hardware to skip the TLBI when freeing page tables. */ #ifndef tlb_needs_table_invalidate #define tlb_needs_table_invalidate() (true) #endif #else #ifdef tlb_needs_table_invalidate #error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE #endif #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ #ifndef CONFIG_MMU_GATHER_NO_GATHER /* * If we can't allocate a page to make a big batch of page pointers * to work on, then just handle a few from the on-stack structure. */ #define MMU_GATHER_BUNDLE 8 struct mmu_gather_batch { struct mmu_gather_batch *next; unsigned int nr; unsigned int max; struct page *pages[0]; }; #define MAX_GATHER_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *)) /* * Limit the maximum number of mmu_gather batches to reduce a risk of soft * lockups for non-preemptible kernels on huge machines when a lot of memory * is zapped during unmapping. * 10K pages freed at once should be safe even without a preemption point. 
*/ #define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH) extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size); #endif /* * struct mmu_gather is an opaque type used by the mm code for passing around * any data needed by arch specific code for tlb_remove_page. */ struct mmu_gather { struct mm_struct *mm; #ifdef CONFIG_MMU_GATHER_TABLE_FREE struct mmu_table_batch *batch; #endif unsigned long start; unsigned long end; /* * we are in the middle of an operation to clear * a full mm and can make some optimizations */ unsigned int fullmm : 1; /* * we have performed an operation which * requires a complete flush of the tlb */ unsigned int need_flush_all : 1; /* * we have removed page directories */ unsigned int freed_tables : 1; /* * at which levels have we cleared entries? */ unsigned int cleared_ptes : 1; unsigned int cleared_pmds : 1; unsigned int cleared_puds : 1; unsigned int cleared_p4ds : 1; /* * tracks VM_EXEC | VM_HUGETLB in tlb_start_vma */ unsigned int vma_exec : 1; unsigned int vma_huge : 1; unsigned int batch_count; #ifndef CONFIG_MMU_GATHER_NO_GATHER struct mmu_gather_batch *active; struct mmu_gather_batch local; struct page *__pages[MMU_GATHER_BUNDLE]; #ifdef CONFIG_MMU_GATHER_PAGE_SIZE unsigned int page_size; #endif #endif }; void tlb_flush_mmu(struct mmu_gather *tlb); static inline void __tlb_adjust_range(struct mmu_gather *tlb, unsigned long address, unsigned int range_size) { tlb->start = min(tlb->start, address); tlb->end = max(tlb->end, address + range_size); } static inline void __tlb_reset_range(struct mmu_gather *tlb) { if (tlb->fullmm) { tlb->start = tlb->end = ~0; } else { tlb->start = TASK_SIZE; tlb->end = 0; } tlb->freed_tables = 0; tlb->cleared_ptes = 0; tlb->cleared_pmds = 0; tlb->cleared_puds = 0; tlb->cleared_p4ds = 0; /* * Do not reset mmu_gather::vma_* fields here, we do not * call into tlb_start_vma() again to set them if there is an * intermediate flush. 
*/ } #ifdef CONFIG_MMU_GATHER_NO_RANGE #if defined(tlb_flush) || defined(tlb_start_vma) || defined(tlb_end_vma) #error MMU_GATHER_NO_RANGE relies on default tlb_flush(), tlb_start_vma() and tlb_end_vma() #endif /* * When an architecture does not have efficient means of range flushing TLBs * there is no point in doing intermediate flushes on tlb_end_vma() to keep the * range small. We equally don't have to worry about page granularity or other * things. * * All we need to do is issue a full flush for any !0 range. */ static inline void tlb_flush(struct mmu_gather *tlb) { if (tlb->end) flush_tlb_mm(tlb->mm); } static inline void tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { } #define tlb_end_vma tlb_end_vma static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { } #else /* CONFIG_MMU_GATHER_NO_RANGE */ #ifndef tlb_flush #if defined(tlb_start_vma) || defined(tlb_end_vma) #error Default tlb_flush() relies on default tlb_start_vma() and tlb_end_vma() #endif /* * When an architecture does not provide its own tlb_flush() implementation * but does have a reasonably efficient flush_vma_range() implementation * use that. */ static inline void tlb_flush(struct mmu_gather *tlb) { if (tlb->fullmm || tlb->need_flush_all) { flush_tlb_mm(tlb->mm); } else if (tlb->end) { struct vm_area_struct vma = { .vm_mm = tlb->mm, .vm_flags = (tlb->vma_exec ? VM_EXEC : 0) | (tlb->vma_huge ? VM_HUGETLB : 0), }; flush_tlb_range(&vma, tlb->start, tlb->end); } } static inline void tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { /* * flush_tlb_range() implementations that look at VM_HUGETLB (tile, * mips-4k) flush only large pages. * * flush_tlb_range() implementations that flush I-TLB also flush D-TLB * (tile, xtensa, arm), so it's ok to just add VM_EXEC to an existing * range. * * We rely on tlb_end_vma() to issue a flush, such that when we reset * these values the batch is empty. 
*/ tlb->vma_huge = is_vm_hugetlb_page(vma); tlb->vma_exec = !!(vma->vm_flags & VM_EXEC); } #else static inline void tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { } #endif #endif /* CONFIG_MMU_GATHER_NO_RANGE */ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) { /* * Anything calling __tlb_adjust_range() also sets at least one of * these bits. */ if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds || tlb->cleared_puds || tlb->cleared_p4ds)) return; tlb_flush(tlb); mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end); __tlb_reset_range(tlb); } static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { if (__tlb_remove_page_size(tlb, page, page_size)) tlb_flush_mmu(tlb); } static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { return __tlb_remove_page_size(tlb, page, PAGE_SIZE); } /* tlb_remove_page * Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when * required. */ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) { return tlb_remove_page_size(tlb, page, PAGE_SIZE); } static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size) { #ifdef CONFIG_MMU_GATHER_PAGE_SIZE if (tlb->page_size && tlb->page_size != page_size) { if (!tlb->fullmm && !tlb->need_flush_all) tlb_flush_mmu(tlb); } tlb->page_size = page_size; #endif } static inline unsigned long tlb_get_unmap_shift(struct mmu_gather *tlb) { if (tlb->cleared_ptes) return PAGE_SHIFT; if (tlb->cleared_pmds) return PMD_SHIFT; if (tlb->cleared_puds) return PUD_SHIFT; if (tlb->cleared_p4ds) return P4D_SHIFT; return PAGE_SHIFT; } static inline unsigned long tlb_get_unmap_size(struct mmu_gather *tlb) { return 1UL << tlb_get_unmap_shift(tlb); } /* * In the case of tlb vma handling, we can optimise these away in the * case where we're doing a full MM flush. 
When we're doing a munmap, * the vmas are adjusted to only cover the region to be torn down. */ #ifndef tlb_start_vma static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (tlb->fullmm) return; tlb_update_vma_flags(tlb, vma); flush_cache_range(vma, vma->vm_start, vma->vm_end); } #endif #ifndef tlb_end_vma static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (tlb->fullmm) return; /* * Do a TLB flush and reset the range at VMA boundaries; this avoids * the ranges growing with the unused space between consecutive VMAs, * but also the mmu_gather::vma_* flags from tlb_start_vma() rely on * this. */ tlb_flush_mmu_tlbonly(tlb); } #endif /* * tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end, * and set corresponding cleared_*. */ static inline void tlb_flush_pte_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_ptes = 1; } static inline void tlb_flush_pmd_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_pmds = 1; } static inline void tlb_flush_pud_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_puds = 1; } static inline void tlb_flush_p4d_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_p4ds = 1; } #ifndef __tlb_remove_tlb_entry #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) #endif /** * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation. * * Record the fact that pte's were really unmapped by updating the range, * so we can later optimise away the tlb invalidate. This helps when * userspace is unmapping already-unmapped pages, which happens quite a lot. 
*/ #define tlb_remove_tlb_entry(tlb, ptep, address) \ do { \ tlb_flush_pte_range(tlb, address, PAGE_SIZE); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ do { \ unsigned long _sz = huge_page_size(h); \ if (_sz == PMD_SIZE) \ tlb_flush_pmd_range(tlb, address, _sz); \ else if (_sz == PUD_SIZE) \ tlb_flush_pud_range(tlb, address, _sz); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) /** * tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation * This is a nop so far, because only x86 needs it. */ #ifndef __tlb_remove_pmd_tlb_entry #define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0) #endif #define tlb_remove_pmd_tlb_entry(tlb, pmdp, address) \ do { \ tlb_flush_pmd_range(tlb, address, HPAGE_PMD_SIZE); \ __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \ } while (0) /** * tlb_remove_pud_tlb_entry - remember a pud mapping for later tlb * invalidation. This is a nop so far, because only x86 needs it. */ #ifndef __tlb_remove_pud_tlb_entry #define __tlb_remove_pud_tlb_entry(tlb, pudp, address) do {} while (0) #endif #define tlb_remove_pud_tlb_entry(tlb, pudp, address) \ do { \ tlb_flush_pud_range(tlb, address, HPAGE_PUD_SIZE); \ __tlb_remove_pud_tlb_entry(tlb, pudp, address); \ } while (0) /* * For things like page tables caches (ie caching addresses "inside" the * page tables, like x86 does), for legacy reasons, flushing an * individual page had better flush the page table caches behind it. This * is definitely how x86 works, for example. And if you have an * architected non-legacy page table cache (which I'm not aware of * anybody actually doing), you're going to have some architecturally * explicit flushing for that, likely *separate* from a regular TLB entry * flush, and thus you'd need more than just some range expansion.. 
* * So if we ever find an architecture * that would want something that odd, I think it is up to that * architecture to do its own odd thing, not cause pain for others * http://lkml.kernel.org/r/CA+55aFzBggoXtNXQeng5d_mRoDnaMBE5Y+URs+PHR67nUpMtaw@mail.gmail.com * * For now w.r.t page table cache, mark the range_size as PAGE_SIZE */ #ifndef pte_free_tlb #define pte_free_tlb(tlb, ptep, address) \ do { \ tlb_flush_pmd_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pte_free_tlb(tlb, ptep, address); \ } while (0) #endif #ifndef pmd_free_tlb #define pmd_free_tlb(tlb, pmdp, address) \ do { \ tlb_flush_pud_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pmd_free_tlb(tlb, pmdp, address); \ } while (0) #endif #ifndef pud_free_tlb #define pud_free_tlb(tlb, pudp, address) \ do { \ tlb_flush_p4d_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pud_free_tlb(tlb, pudp, address); \ } while (0) #endif #ifndef p4d_free_tlb #define p4d_free_tlb(tlb, pudp, address) \ do { \ __tlb_adjust_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __p4d_free_tlb(tlb, pudp, address); \ } while (0) #endif #endif /* CONFIG_MMU */ #endif /* _ASM_GENERIC__TLB_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 /* SPDX-License-Identifier: GPL-2.0-only */ /* * V9FS definitions. * * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> */ #ifndef FS_9P_V9FS_H #define FS_9P_V9FS_H #include <linux/backing-dev.h> /** * enum p9_session_flags - option flags for each 9P session * @V9FS_PROTO_2000U: whether or not to use 9P2000.u extensions * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy * @V9FS_ACCESS_USER: a new attach will be issued for every user (default) * @V9FS_ACCESS_CLIENT: Just like user, but access check is performed on client. 
* @V9FS_ACCESS_ANY: use a single attach for all users * @V9FS_ACCESS_MASK: bit mask of different ACCESS options * @V9FS_POSIX_ACL: POSIX ACLs are enforced * * Session flags reflect options selected by users at mount time */ #define V9FS_ACCESS_ANY (V9FS_ACCESS_SINGLE | \ V9FS_ACCESS_USER | \ V9FS_ACCESS_CLIENT) #define V9FS_ACCESS_MASK V9FS_ACCESS_ANY #define V9FS_ACL_MASK V9FS_POSIX_ACL enum p9_session_flags { V9FS_PROTO_2000U = 0x01, V9FS_PROTO_2000L = 0x02, V9FS_ACCESS_SINGLE = 0x04, V9FS_ACCESS_USER = 0x08, V9FS_ACCESS_CLIENT = 0x10, V9FS_POSIX_ACL = 0x20 }; /* possible values of ->cache */ /** * enum p9_cache_modes - user specified cache preferences * @CACHE_NONE: do not cache data, dentries, or directory contents (default) * @CACHE_LOOSE: cache data, dentries, and directory contents w/no consistency * * eventually support loose, tight, time, session, default always none */ enum p9_cache_modes { CACHE_NONE, CACHE_MMAP, CACHE_LOOSE, CACHE_FSCACHE, nr__p9_cache_modes }; /** * struct v9fs_session_info - per-instance session information * @flags: session options of type &p9_session_flags * @nodev: set to 1 to disable device mapping * @debug: debug level * @afid: authentication handle * @cache: cache mode of type &p9_cache_modes * @cachetag: the tag of the cache associated with this session * @fscache: session cookie associated with FS-Cache * @uname: string user name to mount hierarchy as * @aname: mount specifier for remote hierarchy * @maxdata: maximum data to be sent/recvd per protocol message * @dfltuid: default numeric userid to mount hierarchy as * @dfltgid: default numeric groupid to mount hierarchy as * @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy * @clnt: reference to 9P network client instantiated for this session * @slist: reference to list of registered 9p sessions * * This structure holds state for each session instance established during * a sys_mount() . 
* * Bugs: there seems to be a lot of state which could be condensed and/or * removed. */ struct v9fs_session_info { /* options */ unsigned char flags; unsigned char nodev; unsigned short debug; unsigned int afid; unsigned int cache; #ifdef CONFIG_9P_FSCACHE char *cachetag; struct fscache_cookie *fscache; #endif char *uname; /* user name to mount as */ char *aname; /* name of remote hierarchy being mounted */ unsigned int maxdata; /* max data for client interface */ kuid_t dfltuid; /* default uid/muid for legacy support */ kgid_t dfltgid; /* default gid for legacy support */ kuid_t uid; /* if ACCESS_SINGLE, the uid that has access */ struct p9_client *clnt; /* 9p client */ struct list_head slist; /* list of sessions registered with v9fs */ struct rw_semaphore rename_sem; long session_lock_timeout; /* retry interval for blocking locks */ }; /* cache_validity flags */ #define V9FS_INO_INVALID_ATTR 0x01 struct v9fs_inode { #ifdef CONFIG_9P_FSCACHE struct mutex fscache_lock; struct fscache_cookie *fscache; #endif struct p9_qid qid; unsigned int cache_validity; struct p9_fid *writeback_fid; struct mutex v_mutex; struct inode vfs_inode; }; static inline struct v9fs_inode *V9FS_I(const struct inode *inode) { return container_of(inode, struct v9fs_inode, vfs_inode); } extern int v9fs_show_options(struct seq_file *m, struct dentry *root); struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, char *); extern void v9fs_session_close(struct v9fs_session_info *v9ses); extern void v9fs_session_cancel(struct v9fs_session_info *v9ses); extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses); extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d); extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d); extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned 
int flags); extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb, int new); extern const struct inode_operations v9fs_dir_inode_operations_dotl; extern const struct inode_operations v9fs_file_inode_operations_dotl; extern const struct inode_operations v9fs_symlink_inode_operations_dotl; extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb, int new); /* other default globals */ #define V9FS_PORT 564 #define V9FS_DEFUSER "nobody" #define V9FS_DEFANAME "" #define V9FS_DEFUID KUIDT_INIT(-2) #define V9FS_DEFGID KGIDT_INIT(-2) static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode) { return (inode->i_sb->s_fs_info); } static inline struct v9fs_session_info *v9fs_dentry2v9ses(struct dentry *dentry) { return dentry->d_sb->s_fs_info; } static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses) { return v9ses->flags & V9FS_PROTO_2000U; } static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses) { return v9ses->flags & V9FS_PROTO_2000L; } /** * v9fs_get_inode_from_fid - Helper routine to populate an inode by * issuing a attribute request * @v9ses: session information * @fid: fid to issue attribute request for * @sb: superblock on which to create inode * */ static inline struct inode * v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb) { if (v9fs_proto_dotl(v9ses)) return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 0); else return v9fs_inode_from_fid(v9ses, fid, sb, 0); } /** * v9fs_get_new_inode_from_fid - Helper routine to populate an inode by * issuing a attribute request * @v9ses: session information * @fid: fid to issue attribute request for * @sb: superblock on which to create inode * */ static inline struct inode * v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb) { if (v9fs_proto_dotl(v9ses)) return 
v9fs_inode_from_fid_dotl(v9ses, fid, sb, 1); else return v9fs_inode_from_fid(v9ses, fid, sb, 1); } #endif
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_PGTABLE_INVERT_H
#define _ASM_PGTABLE_INVERT_H 1

#ifndef __ASSEMBLY__

/*
 * A clear pte value is special, and doesn't get inverted.
 *
 * Note that even users that only pass a pgprot_t (rather
 * than a full pte) won't trigger the special zero case,
 * because even PAGE_NONE has _PAGE_PROTNONE | _PAGE_ACCESSED
 * set. So the all zero case really is limited to just the
 * cleared page table entry case.
 */
static inline bool __pte_needs_invert(u64 val)
{
	/* an all-zero (cleared) entry is never inverted */
	if (!val)
		return false;

	/* otherwise invert exactly when the present bit is clear */
	return (val & _PAGE_PRESENT) == 0;
}

/* Get a mask to xor with the page table entry to get the correct pfn. */
static inline u64 protnone_mask(u64 val)
{
	if (__pte_needs_invert(val))
		return ~0ull;

	return 0;
}

/*
 * When a PTE transitions from NONE to !NONE or vice-versa
 * invert the PFN part (the bits selected by @mask) to stop speculation.
 * pte_pfn undoes this when needed.
 */
static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
{
	bool old_inverted = __pte_needs_invert(oldval);
	bool new_inverted = __pte_needs_invert(val);

	/* same state before and after: nothing to flip */
	if (old_inverted == new_inverted)
		return val;

	/* flip only the masked bits: same as (val & ~mask) | (~val & mask) */
	return val ^ mask;
}

#endif /* __ASSEMBLY__ */

#endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_FRAG_H__ #define __NET_FRAG_H__ #include <linux/rhashtable-types.h> #include <linux/completion.h> /* Per netns frag queues directory */ struct fqdir { /* sysctls */ long high_thresh; long low_thresh; int timeout; int max_dist; struct inet_frags *f; struct net *net; bool dead; struct rhashtable rhashtable ____cacheline_aligned_in_smp; /* Keep atomic mem on separate cachelines in structs that include it */ atomic_long_t mem ____cacheline_aligned_in_smp; struct work_struct destroy_work; }; /** * fragment queue flags * * @INET_FRAG_FIRST_IN: first fragment has arrived * @INET_FRAG_LAST_IN: final fragment has arrived * @INET_FRAG_COMPLETE: frag queue has been processed and is due for destruction * @INET_FRAG_HASH_DEAD: inet_frag_kill() has not removed fq from rhashtable */ enum { INET_FRAG_FIRST_IN = BIT(0), INET_FRAG_LAST_IN = BIT(1), INET_FRAG_COMPLETE = BIT(2), INET_FRAG_HASH_DEAD = BIT(3), }; struct frag_v4_compare_key { __be32 saddr; __be32 daddr; u32 user; u32 vif; __be16 id; u16 protocol; }; struct frag_v6_compare_key { struct in6_addr saddr; struct in6_addr daddr; u32 user; __be32 id; u32 iif; }; /** * struct inet_frag_queue - fragment queue * * @node: rhash node * @key: keys identifying this frag. 
* @timer: queue expiration timer * @lock: spinlock protecting this frag * @refcnt: reference count of the queue * @rb_fragments: received fragments rb-tree root * @fragments_tail: received fragments tail * @last_run_head: the head of the last "run". see ip_fragment.c * @stamp: timestamp of the last received fragment * @len: total length of the original datagram * @meat: length of received fragments so far * @flags: fragment queue flags * @max_size: maximum received fragment size * @fqdir: pointer to struct fqdir * @rcu: rcu head for freeing deferall */ struct inet_frag_queue { struct rhash_head node; union { struct frag_v4_compare_key v4; struct frag_v6_compare_key v6; } key; struct timer_list timer; spinlock_t lock; refcount_t refcnt; struct rb_root rb_fragments; struct sk_buff *fragments_tail; struct sk_buff *last_run_head; ktime_t stamp; int len; int meat; __u8 flags; u16 max_size; struct fqdir *fqdir; struct rcu_head rcu; }; struct inet_frags { unsigned int qsize; void (*constructor)(struct inet_frag_queue *q, const void *arg); void (*destructor)(struct inet_frag_queue *); void (*frag_expire)(struct timer_list *t); struct kmem_cache *frags_cachep; const char *frags_cache_name; struct rhashtable_params rhash_params; refcount_t refcnt; struct completion completion; }; int inet_frags_init(struct inet_frags *); void inet_frags_fini(struct inet_frags *); int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net); static inline void fqdir_pre_exit(struct fqdir *fqdir) { fqdir->high_thresh = 0; /* prevent creation of new frags */ fqdir->dead = true; } void fqdir_exit(struct fqdir *fqdir); void inet_frag_kill(struct inet_frag_queue *q); void inet_frag_destroy(struct inet_frag_queue *q); struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key); /* Free all skbs in the queue; return the sum of their truesizes. 
*/ unsigned int inet_frag_rbtree_purge(struct rb_root *root); static inline void inet_frag_put(struct inet_frag_queue *q) { if (refcount_dec_and_test(&q->refcnt)) inet_frag_destroy(q); } /* Memory Tracking Functions. */ static inline long frag_mem_limit(const struct fqdir *fqdir) { return atomic_long_read(&fqdir->mem); } static inline void sub_frag_mem_limit(struct fqdir *fqdir, long val) { atomic_long_sub(val, &fqdir->mem); } static inline void add_frag_mem_limit(struct fqdir *fqdir, long val) { atomic_long_add(val, &fqdir->mem); } /* RFC 3168 support : * We want to check ECN values of all fragments, do detect invalid combinations. * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value. */ #define IPFRAG_ECN_NOT_ECT 0x01 /* one frag had ECN_NOT_ECT */ #define IPFRAG_ECN_ECT_1 0x02 /* one frag had ECN_ECT_1 */ #define IPFRAG_ECN_ECT_0 0x04 /* one frag had ECN_ECT_0 */ #define IPFRAG_ECN_CE 0x08 /* one frag had ECN_CE */ extern const u8 ip_frag_ecn_table[16]; /* Return values of inet_frag_queue_insert() */ #define IPFRAG_OK 0 #define IPFRAG_DUP 1 #define IPFRAG_OVERLAP 2 int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, int offset, int end); void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, struct sk_buff *parent); void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, void *reasm_data, bool try_coalesce); struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q); #endif
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_GFP_H #define __LINUX_GFP_H #include <linux/mmdebug.h> #include <linux/mmzone.h> #include <linux/stddef.h> #include <linux/linkage.h> #include <linux/topology.h> struct vm_area_struct; /* * In case of changes, please don't forget to update * include/trace/events/mmflags.h and tools/perf/builtin-kmem.c */ /* Plain integer GFP bitmasks. Do not use this directly. */ #define ___GFP_DMA 0x01u #define ___GFP_HIGHMEM 0x02u #define ___GFP_DMA32 0x04u #define ___GFP_MOVABLE 0x08u #define ___GFP_RECLAIMABLE 0x10u #define ___GFP_HIGH 0x20u #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u #define ___GFP_ZERO 0x100u #define ___GFP_ATOMIC 0x200u #define ___GFP_DIRECT_RECLAIM 0x400u #define ___GFP_KSWAPD_RECLAIM 0x800u #define ___GFP_WRITE 0x1000u #define ___GFP_NOWARN 0x2000u #define ___GFP_RETRY_MAYFAIL 0x4000u #define ___GFP_NOFAIL 0x8000u #define ___GFP_NORETRY 0x10000u #define ___GFP_MEMALLOC 0x20000u #define ___GFP_COMP 0x40000u #define ___GFP_NOMEMALLOC 0x80000u #define ___GFP_HARDWALL 0x100000u #define ___GFP_THISNODE 0x200000u #define ___GFP_ACCOUNT 0x400000u #ifdef CONFIG_LOCKDEP #define ___GFP_NOLOCKDEP 0x800000u #else #define ___GFP_NOLOCKDEP 0 #endif /* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* * Physical address zone modifiers (see linux/mmzone.h - low four bits) * * Do not put any conditional on these. If necessary modify the definitions * without the underscores and use them consistently. 
The definitions here may * be used in bit comparisons. */ #define __GFP_DMA ((__force gfp_t)___GFP_DMA) #define __GFP_HIGHMEM ((__force gfp_t)___GFP_HIGHMEM) #define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32) #define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* ZONE_MOVABLE allowed */ #define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE) /** * DOC: Page mobility and placement hints * * Page mobility and placement hints * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * These flags provide hints about how mobile the page is. Pages with similar * mobility are placed within the same pageblocks to minimise problems due * to external fragmentation. * * %__GFP_MOVABLE (also a zone modifier) indicates that the page can be * moved by page migration during memory compaction or can be reclaimed. * * %__GFP_RECLAIMABLE is used for slab allocations that specify * SLAB_RECLAIM_ACCOUNT and whose pages can be freed via shrinkers. * * %__GFP_WRITE indicates the caller intends to dirty the page. Where possible, * these pages will be spread between local zones to avoid all the dirty * pages being in one zone (fair zone allocation policy). * * %__GFP_HARDWALL enforces the cpuset memory allocation policy. * * %__GFP_THISNODE forces the allocation to be satisfied from the requested * node with no fallbacks or placement policy enforcements. * * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg. 
*/ #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) #define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) #define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE) #define __GFP_ACCOUNT ((__force gfp_t)___GFP_ACCOUNT) /** * DOC: Watermark modifiers * * Watermark modifiers -- controls access to emergency reserves * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * %__GFP_HIGH indicates that the caller is high-priority and that granting * the request is necessary before the system can make forward progress. * For example, creating an IO context to clean pages. * * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is * high priority. Users are typically interrupt handlers. This may be * used in conjunction with %__GFP_HIGH * * %__GFP_MEMALLOC allows access to all memory. This should only be used when * the caller guarantees the allocation will allow more memory to be freed * very shortly e.g. process exiting or swapping. Users either should * be the MM or co-ordinating closely with the VM (e.g. swap over NFS). * Users of this flag have to be extremely careful to not deplete the reserve * completely and implement a throttling mechanism which controls the * consumption of the reserve based on the amount of freed memory. * Usage of a pre-allocated pool (e.g. mempool) should be always considered * before using this flag. * * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. * This takes precedence over the %__GFP_MEMALLOC flag if both are set. */ #define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC) #define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) #define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC) #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /** * DOC: Reclaim modifiers * * Reclaim modifiers * ~~~~~~~~~~~~~~~~~ * Please note that all the following flags are only applicable to sleepable * allocations (e.g. 
%GFP_NOWAIT and %GFP_ATOMIC will ignore them). * * %__GFP_IO can start physical IO. * * %__GFP_FS can call down to the low-level FS. Clearing the flag avoids the * allocator recursing into the filesystem which might already be holding * locks. * * %__GFP_DIRECT_RECLAIM indicates that the caller may enter direct reclaim. * This flag can be cleared to avoid unnecessary delays when a fallback * option is available. * * %__GFP_KSWAPD_RECLAIM indicates that the caller wants to wake kswapd when * the low watermark is reached and have it reclaim pages until the high * watermark is reached. A caller may wish to clear this flag when fallback * options are available and the reclaim is likely to disrupt the system. The * canonical example is THP allocation where a fallback is cheap but * reclaim/compaction may cause indirect stalls. * * %__GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim. * * The default allocator behavior depends on the request size. We have a concept * of so called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER). * !costly allocations are too essential to fail so they are implicitly * non-failing by default (with some exceptions like OOM victims might fail so * the caller still has to check for failures) while costly requests try to be * not disruptive and back off even without invoking the OOM killer. * The following three modifiers might be used to override some of these * implicit rules * * %__GFP_NORETRY: The VM implementation will try only very lightweight * memory direct reclaim to get some memory under memory pressure (thus * it can sleep). It will avoid disruptive actions like OOM killer. The * caller must handle the failure which is quite likely to happen under * heavy memory pressure. 
The flag is suitable when failure can easily be * handled at small cost, such as reduced throughput * * %__GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim * procedures that have previously failed if there is some indication * that progress has been made else where. It can wait for other * tasks to attempt high level approaches to freeing memory such as * compaction (which removes fragmentation) and page-out. * There is still a definite limit to the number of retries, but it is * a larger limit than with %__GFP_NORETRY. * Allocations with this flag may fail, but only when there is * genuinely little unused memory. While these allocations do not * directly trigger the OOM killer, their failure indicates that * the system is likely to need to use the OOM killer soon. The * caller must handle failure, but can reasonably do so by failing * a higher-level request, or completing it only in a much less * efficient manner. * If the allocation does fail, and the caller is in a position to * free some non-essential memory, doing so could benefit the system * as a whole. * * %__GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller * cannot handle allocation failures. The allocation could block * indefinitely but will never return with failure. Testing for * failure is pointless. * New users should be evaluated carefully (and the flag should be * used only when there is no reasonable failure policy) but it is * definitely preferable to use the flag rather than opencode endless * loop around allocator. * Using this flag for costly allocations is _highly_ discouraged. 
*/ #define __GFP_IO ((__force gfp_t)___GFP_IO) #define __GFP_FS ((__force gfp_t)___GFP_FS) #define __GFP_DIRECT_RECLAIM ((__force gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can reclaim */ #define __GFP_KSWAPD_RECLAIM ((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */ #define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM)) #define __GFP_RETRY_MAYFAIL ((__force gfp_t)___GFP_RETRY_MAYFAIL) #define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL) #define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY) /** * DOC: Action modifiers * * Action modifiers * ~~~~~~~~~~~~~~~~ * * %__GFP_NOWARN suppresses allocation failure reports. * * %__GFP_COMP address compound page metadata. * * %__GFP_ZERO returns a zeroed page on success. */ #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) #define __GFP_COMP ((__force gfp_t)___GFP_COMP) #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) /* Disable lockdep for GFP context tracking */ #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ #define __GFP_BITS_SHIFT (23 + IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /** * DOC: Useful GFP flag combinations * * Useful GFP flag combinations * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * Useful GFP flag combinations that are commonly used. It is recommended * that subsystems start with one of these combinations and then set/clear * %__GFP_FOO flags as necessary. * * %GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower * watermark is applied to allow access to "atomic reserves". * The current implementation doesn't support NMI and few other strict * non-preemptive contexts (e.g. raw_spin_lock). The same applies to %GFP_NOWAIT. * * %GFP_KERNEL is typical for kernel-internal allocations. The caller requires * %ZONE_NORMAL or a lower zone for direct access but can direct reclaim. 
* * %GFP_KERNEL_ACCOUNT is the same as GFP_KERNEL, except the allocation is * accounted to kmemcg. * * %GFP_NOWAIT is for kernel allocations that should not stall for direct * reclaim, start physical IO or use any filesystem callback. * * %GFP_NOIO will use direct reclaim to discard clean pages or slab pages * that do not require the starting of any physical IO. * Please try to avoid using this flag directly and instead use * memalloc_noio_{save,restore} to mark the whole scope which cannot * perform any IO with a short explanation why. All allocation requests * will inherit GFP_NOIO implicitly. * * %GFP_NOFS will use direct reclaim but will not use any filesystem interfaces. * Please try to avoid using this flag directly and instead use * memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't * recurse into the FS layer with a short explanation why. All allocation * requests will inherit GFP_NOFS implicitly. * * %GFP_USER is for userspace allocations that also need to be directly * accessibly by the kernel or hardware. It is typically used by hardware * for buffers that are mapped to userspace (e.g. graphics) that hardware * still must DMA to. cpuset limits are enforced for these allocations. * * %GFP_DMA exists for historical reasons and should be avoided where possible. * The flags indicates that the caller requires that the lowest zone be * used (%ZONE_DMA or 16M on x86-64). Ideally, this would be removed but * it would require careful auditing as some users really require it and * others use the flag to avoid lowmem reserves in %ZONE_DMA and treat the * lowest zone as a type of emergency reserve. * * %GFP_DMA32 is similar to %GFP_DMA except that the caller requires a 32-bit * address. * * %GFP_HIGHUSER is for userspace allocations that may be mapped to userspace, * do not need to be directly accessible by the kernel but that cannot * move once in use. 
An example may be a hardware allocation that maps * data directly into userspace but has no addressing limitations. * * %GFP_HIGHUSER_MOVABLE is for userspace allocations that the kernel does not * need direct access to but can use kmap() when access is required. They * are expected to be movable via page reclaim or page migration. Typically, * pages on the LRU would also be allocated with %GFP_HIGHUSER_MOVABLE. * * %GFP_TRANSHUGE and %GFP_TRANSHUGE_LIGHT are used for THP allocations. They * are compound allocations that will generally fail quickly if memory is not * available and will not wake kswapd/kcompactd on failure. The _LIGHT * version does not attempt reclaim/compaction at all and is by default used * in page fault path, while the non-light is used by khugepaged. */ #define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) #define GFP_NOIO (__GFP_RECLAIM) #define GFP_NOFS (__GFP_RECLAIM | __GFP_IO) #define GFP_USER (__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL) #define GFP_DMA __GFP_DMA #define GFP_DMA32 __GFP_DMA32 #define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM) #define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE) #define GFP_TRANSHUGE_LIGHT ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM) #define GFP_TRANSHUGE (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM) /* Convert GFP flags to their corresponding migrate type */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) #define GFP_MOVABLE_SHIFT 3 static inline int gfp_migratetype(const gfp_t gfp_flags) { VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE); BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE); if (unlikely(page_group_by_mobility_disabled)) return MIGRATE_UNMOVABLE; /* Group based on 
mobility */ return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT; } #undef GFP_MOVABLE_MASK #undef GFP_MOVABLE_SHIFT static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) { return !!(gfp_flags & __GFP_DIRECT_RECLAIM); } /** * gfpflags_normal_context - is gfp_flags a normal sleepable context? * @gfp_flags: gfp_flags to test * * Test whether @gfp_flags indicates that the allocation is from the * %current context and allowed to sleep. * * An allocation being allowed to block doesn't mean it owns the %current * context. When direct reclaim path tries to allocate memory, the * allocation context is nested inside whatever %current was doing at the * time of the original allocation. The nested allocation may be allowed * to block but modifying anything %current owns can corrupt the outer * context's expectations. * * %true result from this function indicates that the allocation context * can sleep and use anything that's associated with %current. */ static inline bool gfpflags_normal_context(const gfp_t gfp_flags) { return (gfp_flags & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC)) == __GFP_DIRECT_RECLAIM; } #ifdef CONFIG_HIGHMEM #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM #else #define OPT_ZONE_HIGHMEM ZONE_NORMAL #endif #ifdef CONFIG_ZONE_DMA #define OPT_ZONE_DMA ZONE_DMA #else #define OPT_ZONE_DMA ZONE_NORMAL #endif #ifdef CONFIG_ZONE_DMA32 #define OPT_ZONE_DMA32 ZONE_DMA32 #else #define OPT_ZONE_DMA32 ZONE_NORMAL #endif /* * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the * zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT * bits long and there are 16 of them to cover all possible combinations of * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM. * * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA. * But GFP_MOVABLE is not only a zone specifier but also an allocation * policy. Therefore __GFP_MOVABLE plus another zone selector is valid. 
* Only 1 bit of the lowest 3 bits (DMA,DMA32,HIGHMEM) can be set to "1". * * bit result * ================= * 0x0 => NORMAL * 0x1 => DMA or NORMAL * 0x2 => HIGHMEM or NORMAL * 0x3 => BAD (DMA+HIGHMEM) * 0x4 => DMA32 or NORMAL * 0x5 => BAD (DMA+DMA32) * 0x6 => BAD (HIGHMEM+DMA32) * 0x7 => BAD (HIGHMEM+DMA32+DMA) * 0x8 => NORMAL (MOVABLE+0) * 0x9 => DMA or NORMAL (MOVABLE+DMA) * 0xa => MOVABLE (Movable is valid only if HIGHMEM is set too) * 0xb => BAD (MOVABLE+HIGHMEM+DMA) * 0xc => DMA32 or NORMAL (MOVABLE+DMA32) * 0xd => BAD (MOVABLE+DMA32+DMA) * 0xe => BAD (MOVABLE+DMA32+HIGHMEM) * 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA) * * GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms. */ #if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4 /* ZONE_DEVICE is not a valid GFP zone specifier */ #define GFP_ZONES_SHIFT 2 #else #define GFP_ZONES_SHIFT ZONES_SHIFT #endif #if 16 * GFP_ZONES_SHIFT > BITS_PER_LONG #error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer #endif #define GFP_ZONE_TABLE ( \ (ZONE_NORMAL << 0 * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT) \ | (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT) \ | (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT) \ | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\ | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\ ) /* * GFP_ZONE_BAD is a bitmap for all combinations of __GFP_DMA, __GFP_DMA32 * __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per * entry starting with bit 0. Bit is set if the combination is not * allowed. 
*/ #define GFP_ZONE_BAD ( \ 1 << (___GFP_DMA | ___GFP_HIGHMEM) \ | 1 << (___GFP_DMA | ___GFP_DMA32) \ | 1 << (___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM) \ ) static inline enum zone_type gfp_zone(gfp_t flags) { enum zone_type z; int bit = (__force int) (flags & GFP_ZONEMASK); z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) & ((1 << GFP_ZONES_SHIFT) - 1); VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1); return z; } /* * There is only one page-allocator function, and two main namespaces to * it. The alloc_page*() variants return 'struct page *' and as such * can allocate highmem pages, the *get*page*() variants return * virtual kernel addresses to the allocated page(s). */ static inline int gfp_zonelist(gfp_t flags) { #ifdef CONFIG_NUMA if (unlikely(flags & __GFP_THISNODE)) return ZONELIST_NOFALLBACK; #endif return ZONELIST_FALLBACK; } /* * We get the zone list from the current node and the gfp_mask. * This zone list contains a maximum of MAXNODES*MAX_NR_ZONES zones. * There are two zonelists per node, one for all zones with memory and * one containing just zones from the node the zonelist belongs to. * * For the normal case of non-DISCONTIGMEM systems the NODE_DATA() gets * optimized to &contig_page_data at compile-time. 
*/ static inline struct zonelist *node_zonelist(int nid, gfp_t flags) { return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags); } #ifndef HAVE_ARCH_FREE_PAGE static inline void arch_free_page(struct page *page, int order) { } #endif #ifndef HAVE_ARCH_ALLOC_PAGE static inline void arch_alloc_page(struct page *page, int order) { } #endif #ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE static inline int arch_make_page_accessible(struct page *page) { return 0; } #endif struct page * __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, nodemask_t *nodemask); static inline struct page * __alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid) { return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL); } /* * Allocate pages, preferring the node given as nid. The node must be valid and * online. For more general interface, see alloc_pages_node(). */ static inline struct page * __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); VM_WARN_ON((gfp_mask & __GFP_THISNODE) && !node_online(nid)); return __alloc_pages(gfp_mask, order, nid); } /* * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE, * prefer the current CPU's closest node. Otherwise node must be valid and * online. 
*/ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { if (nid == NUMA_NO_NODE) nid = numa_mem_id(); return __alloc_pages_node(nid, gfp_mask, order); } #ifdef CONFIG_NUMA extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order); static inline struct page * alloc_pages(gfp_t gfp_mask, unsigned int order) { return alloc_pages_current(gfp_mask, order); } extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, int node, bool hugepage); #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true) #else static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) { return alloc_pages_node(numa_node_id(), gfp_mask, order); } #define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ alloc_pages(gfp_mask, order) #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) #define alloc_page_vma(gfp_mask, vma, addr) \ alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); void *alloc_pages_exact(size_t size, gfp_t gfp_mask); void free_pages_exact(void *virt, size_t size); void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask), 0) #define __get_dma_pages(gfp_mask, order) \ __get_free_pages((gfp_mask) | GFP_DMA, (order)) extern void __free_pages(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); extern void free_unref_page(struct page *page); extern void free_unref_page_list(struct list_head *list); struct page_frag_cache; extern void __page_frag_cache_drain(struct page *page, unsigned int count); extern void 
*page_frag_alloc(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask); extern void page_frag_free(void *addr); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) void page_alloc_init(void); void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); void page_alloc_init_late(void); /* * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what * GFP flags are used before interrupts are enabled. Once interrupts are * enabled, it is set to __GFP_BITS_MASK while the system is running. During * hibernation, it is used by PM to avoid I/O during memory allocation while * devices are suspended. */ extern gfp_t gfp_allowed_mask; /* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask); extern void pm_restrict_gfp_mask(void); extern void pm_restore_gfp_mask(void); #ifdef CONFIG_PM_SLEEP extern bool pm_suspended_storage(void); #else static inline bool pm_suspended_storage(void) { return false; } #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_CONTIG_ALLOC /* The below functions must be run on a range from a single zone. */ extern int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask); extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, int nid, nodemask_t *nodemask); #endif void free_contig_range(unsigned long pfn, unsigned int nr_pages); #ifdef CONFIG_CMA /* CMA stuff */ extern void init_cma_reserved_pageblock(struct page *page); #endif #endif /* __LINUX_GFP_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * linux/drivers/char/serial_core.h * * Copyright (C) 2000 Deep Blue Solutions Ltd. */ #ifndef LINUX_SERIAL_CORE_H #define LINUX_SERIAL_CORE_H #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/console.h> #include <linux/interrupt.h> #include <linux/circ_buf.h> #include <linux/spinlock.h> #include <linux/sched.h> #include <linux/tty.h> #include <linux/mutex.h> #include <linux/sysrq.h> #include <uapi/linux/serial_core.h> #ifdef CONFIG_SERIAL_CORE_CONSOLE #define uart_console(port) \ ((port)->cons && (port)->cons->index == (port)->line) #else #define uart_console(port) ({ (void)port; 0; }) #endif struct uart_port; struct serial_struct; struct device; struct gpio_desc; /* * This structure describes all the operations that can be done on the * physical hardware. See Documentation/driver-api/serial/driver.rst for details. 
*/ struct uart_ops { unsigned int (*tx_empty)(struct uart_port *); void (*set_mctrl)(struct uart_port *, unsigned int mctrl); unsigned int (*get_mctrl)(struct uart_port *); void (*stop_tx)(struct uart_port *); void (*start_tx)(struct uart_port *); void (*throttle)(struct uart_port *); void (*unthrottle)(struct uart_port *); void (*send_xchar)(struct uart_port *, char ch); void (*stop_rx)(struct uart_port *); void (*enable_ms)(struct uart_port *); void (*break_ctl)(struct uart_port *, int ctl); int (*startup)(struct uart_port *); void (*shutdown)(struct uart_port *); void (*flush_buffer)(struct uart_port *); void (*set_termios)(struct uart_port *, struct ktermios *new, struct ktermios *old); void (*set_ldisc)(struct uart_port *, struct ktermios *); void (*pm)(struct uart_port *, unsigned int state, unsigned int oldstate); /* * Return a string describing the type of the port */ const char *(*type)(struct uart_port *); /* * Release IO and memory resources used by the port. * This includes iounmap if necessary. */ void (*release_port)(struct uart_port *); /* * Request IO and memory resources used by the port. * This includes iomapping the port if necessary. 
*/ int (*request_port)(struct uart_port *); void (*config_port)(struct uart_port *, int); int (*verify_port)(struct uart_port *, struct serial_struct *); int (*ioctl)(struct uart_port *, unsigned int, unsigned long); #ifdef CONFIG_CONSOLE_POLL int (*poll_init)(struct uart_port *); void (*poll_put_char)(struct uart_port *, unsigned char); int (*poll_get_char)(struct uart_port *); #endif }; #define NO_POLL_CHAR 0x00ff0000 #define UART_CONFIG_TYPE (1 << 0) #define UART_CONFIG_IRQ (1 << 1) struct uart_icount { __u32 cts; __u32 dsr; __u32 rng; __u32 dcd; __u32 rx; __u32 tx; __u32 frame; __u32 overrun; __u32 parity; __u32 brk; __u32 buf_overrun; }; typedef unsigned int __bitwise upf_t; typedef unsigned int __bitwise upstat_t; struct uart_port { spinlock_t lock; /* port lock */ unsigned long iobase; /* in/out[bwl] */ unsigned char __iomem *membase; /* read/write[bwl] */ unsigned int (*serial_in)(struct uart_port *, int); void (*serial_out)(struct uart_port *, int, int); void (*set_termios)(struct uart_port *, struct ktermios *new, struct ktermios *old); void (*set_ldisc)(struct uart_port *, struct ktermios *); unsigned int (*get_mctrl)(struct uart_port *); void (*set_mctrl)(struct uart_port *, unsigned int); unsigned int (*get_divisor)(struct uart_port *, unsigned int baud, unsigned int *frac); void (*set_divisor)(struct uart_port *, unsigned int baud, unsigned int quot, unsigned int quot_frac); int (*startup)(struct uart_port *port); void (*shutdown)(struct uart_port *port); void (*throttle)(struct uart_port *port); void (*unthrottle)(struct uart_port *port); int (*handle_irq)(struct uart_port *); void (*pm)(struct uart_port *, unsigned int state, unsigned int old); void (*handle_break)(struct uart_port *); int (*rs485_config)(struct uart_port *, struct serial_rs485 *rs485); int (*iso7816_config)(struct uart_port *, struct serial_iso7816 *iso7816); unsigned int irq; /* irq number */ unsigned long irqflags; /* irq flags */ unsigned int uartclk; /* base uart clock */ 
unsigned int fifosize; /* tx fifo size */ unsigned char x_char; /* xon/xoff char */ unsigned char regshift; /* reg offset shift */ unsigned char iotype; /* io access style */ unsigned char quirks; /* internal quirks */ #define UPIO_PORT (SERIAL_IO_PORT) /* 8b I/O port access */ #define UPIO_HUB6 (SERIAL_IO_HUB6) /* Hub6 ISA card */ #define UPIO_MEM (SERIAL_IO_MEM) /* driver-specific */ #define UPIO_MEM32 (SERIAL_IO_MEM32) /* 32b little endian */ #define UPIO_AU (SERIAL_IO_AU) /* Au1x00 and RT288x type IO */ #define UPIO_TSI (SERIAL_IO_TSI) /* Tsi108/109 type IO */ #define UPIO_MEM32BE (SERIAL_IO_MEM32BE) /* 32b big endian */ #define UPIO_MEM16 (SERIAL_IO_MEM16) /* 16b little endian */ /* quirks must be updated while holding port mutex */ #define UPQ_NO_TXEN_TEST BIT(0) unsigned int read_status_mask; /* driver specific */ unsigned int ignore_status_mask; /* driver specific */ struct uart_state *state; /* pointer to parent state */ struct uart_icount icount; /* statistics */ struct console *cons; /* struct console, if any */ /* flags must be updated while holding port mutex */ upf_t flags; /* * These flags must be equivalent to the flags defined in * include/uapi/linux/tty_flags.h which are the userspace definitions * assigned from the serial_struct flags in uart_set_info() * [for bit definitions in the UPF_CHANGE_MASK] * * Bits [0..UPF_LAST_USER] are userspace defined/visible/changeable * The remaining bits are serial-core specific and not modifiable by * userspace. 
*/ #define UPF_FOURPORT ((__force upf_t) ASYNC_FOURPORT /* 1 */ ) #define UPF_SAK ((__force upf_t) ASYNC_SAK /* 2 */ ) #define UPF_SPD_HI ((__force upf_t) ASYNC_SPD_HI /* 4 */ ) #define UPF_SPD_VHI ((__force upf_t) ASYNC_SPD_VHI /* 5 */ ) #define UPF_SPD_CUST ((__force upf_t) ASYNC_SPD_CUST /* 0x0030 */ ) #define UPF_SPD_WARP ((__force upf_t) ASYNC_SPD_WARP /* 0x1010 */ ) #define UPF_SPD_MASK ((__force upf_t) ASYNC_SPD_MASK /* 0x1030 */ ) #define UPF_SKIP_TEST ((__force upf_t) ASYNC_SKIP_TEST /* 6 */ ) #define UPF_AUTO_IRQ ((__force upf_t) ASYNC_AUTO_IRQ /* 7 */ ) #define UPF_HARDPPS_CD ((__force upf_t) ASYNC_HARDPPS_CD /* 11 */ ) #define UPF_SPD_SHI ((__force upf_t) ASYNC_SPD_SHI /* 12 */ ) #define UPF_LOW_LATENCY ((__force upf_t) ASYNC_LOW_LATENCY /* 13 */ ) #define UPF_BUGGY_UART ((__force upf_t) ASYNC_BUGGY_UART /* 14 */ ) #define UPF_MAGIC_MULTIPLIER ((__force upf_t) ASYNC_MAGIC_MULTIPLIER /* 16 */ ) #define UPF_NO_THRE_TEST ((__force upf_t) (1 << 19)) /* Port has hardware-assisted h/w flow control */ #define UPF_AUTO_CTS ((__force upf_t) (1 << 20)) #define UPF_AUTO_RTS ((__force upf_t) (1 << 21)) #define UPF_HARD_FLOW ((__force upf_t) (UPF_AUTO_CTS | UPF_AUTO_RTS)) /* Port has hardware-assisted s/w flow control */ #define UPF_SOFT_FLOW ((__force upf_t) (1 << 22)) #define UPF_CONS_FLOW ((__force upf_t) (1 << 23)) #define UPF_SHARE_IRQ ((__force upf_t) (1 << 24)) #define UPF_EXAR_EFR ((__force upf_t) (1 << 25)) #define UPF_BUG_THRE ((__force upf_t) (1 << 26)) /* The exact UART type is known and should not be probed. 
*/ #define UPF_FIXED_TYPE ((__force upf_t) (1 << 27)) #define UPF_BOOT_AUTOCONF ((__force upf_t) (1 << 28)) #define UPF_FIXED_PORT ((__force upf_t) (1 << 29)) #define UPF_DEAD ((__force upf_t) (1 << 30)) #define UPF_IOREMAP ((__force upf_t) (1 << 31)) #define __UPF_CHANGE_MASK 0x17fff #define UPF_CHANGE_MASK ((__force upf_t) __UPF_CHANGE_MASK) #define UPF_USR_MASK ((__force upf_t) (UPF_SPD_MASK|UPF_LOW_LATENCY)) #if __UPF_CHANGE_MASK > ASYNC_FLAGS #error Change mask not equivalent to userspace-visible bit defines #endif /* * Must hold termios_rwsem, port mutex and port lock to change; * can hold any one lock to read. */ upstat_t status; #define UPSTAT_CTS_ENABLE ((__force upstat_t) (1 << 0)) #define UPSTAT_DCD_ENABLE ((__force upstat_t) (1 << 1)) #define UPSTAT_AUTORTS ((__force upstat_t) (1 << 2)) #define UPSTAT_AUTOCTS ((__force upstat_t) (1 << 3)) #define UPSTAT_AUTOXOFF ((__force upstat_t) (1 << 4)) #define UPSTAT_SYNC_FIFO ((__force upstat_t) (1 << 5)) int hw_stopped; /* sw-assisted CTS flow state */ unsigned int mctrl; /* current modem ctrl settings */ unsigned int timeout; /* character-based timeout */ unsigned int type; /* port type */ const struct uart_ops *ops; unsigned int custom_divisor; unsigned int line; /* port index */ unsigned int minor; resource_size_t mapbase; /* for ioremap */ resource_size_t mapsize; struct device *dev; /* parent device */ unsigned long sysrq; /* sysrq timeout */ unsigned int sysrq_ch; /* char for sysrq */ unsigned char has_sysrq; unsigned char sysrq_seq; /* index in sysrq_toggle_seq */ unsigned char hub6; /* this should be in the 8250 driver */ unsigned char suspended; unsigned char console_reinit; const char *name; /* port name */ struct attribute_group *attr_group; /* port specific attributes */ const struct attribute_group **tty_groups; /* all attributes (serial core use only) */ struct serial_rs485 rs485; struct gpio_desc *rs485_term_gpio; /* enable RS485 bus termination */ struct serial_iso7816 iso7816; void 
*private_data; /* generic platform data pointer */ }; static inline int serial_port_in(struct uart_port *up, int offset) { return up->serial_in(up, offset); } static inline void serial_port_out(struct uart_port *up, int offset, int value) { up->serial_out(up, offset, value); } /** * enum uart_pm_state - power states for UARTs * @UART_PM_STATE_ON: UART is powered, up and operational * @UART_PM_STATE_OFF: UART is powered off * @UART_PM_STATE_UNDEFINED: sentinel */ enum uart_pm_state { UART_PM_STATE_ON = 0, UART_PM_STATE_OFF = 3, /* number taken from ACPI */ UART_PM_STATE_UNDEFINED, }; /* * This is the state information which is persistent across opens. */ struct uart_state { struct tty_port port; enum uart_pm_state pm_state; struct circ_buf xmit; atomic_t refcount; wait_queue_head_t remove_wait; struct uart_port *uart_port; }; #define UART_XMIT_SIZE PAGE_SIZE /* number of characters left in xmit buffer before we ask for more */ #define WAKEUP_CHARS 256 struct module; struct tty_driver; struct uart_driver { struct module *owner; const char *driver_name; const char *dev_name; int major; int minor; int nr; struct console *cons; /* * these are private; the low level driver should not * touch these; they should be initialised to NULL */ struct uart_state *state; struct tty_driver *tty_driver; }; void uart_write_wakeup(struct uart_port *port); /* * Baud rate helpers. */ void uart_update_timeout(struct uart_port *port, unsigned int cflag, unsigned int baud); unsigned int uart_get_baud_rate(struct uart_port *port, struct ktermios *termios, struct ktermios *old, unsigned int min, unsigned int max); unsigned int uart_get_divisor(struct uart_port *port, unsigned int baud); /* Base timer interval for polling */ static inline int uart_poll_timeout(struct uart_port *port) { int timeout = port->timeout; return timeout > 6 ? (timeout / 2 - 2) : 1; } /* * Console helpers. 
*/ struct earlycon_device { struct console *con; struct uart_port port; char options[16]; /* e.g., 115200n8 */ unsigned int baud; }; struct earlycon_id { char name[15]; char name_term; /* In case compiler didn't '\0' term name */ char compatible[128]; int (*setup)(struct earlycon_device *, const char *options); }; extern const struct earlycon_id *__earlycon_table[]; extern const struct earlycon_id *__earlycon_table_end[]; #if defined(CONFIG_SERIAL_EARLYCON) && !defined(MODULE) #define EARLYCON_USED_OR_UNUSED __used #else #define EARLYCON_USED_OR_UNUSED __maybe_unused #endif #define _OF_EARLYCON_DECLARE(_name, compat, fn, unique_id) \ static const struct earlycon_id unique_id \ EARLYCON_USED_OR_UNUSED __initconst \ = { .name = __stringify(_name), \ .compatible = compat, \ .setup = fn }; \ static const struct earlycon_id EARLYCON_USED_OR_UNUSED \ __section("__earlycon_table") \ * const __PASTE(__p, unique_id) = &unique_id #define OF_EARLYCON_DECLARE(_name, compat, fn) \ _OF_EARLYCON_DECLARE(_name, compat, fn, \ __UNIQUE_ID(__earlycon_##_name)) #define EARLYCON_DECLARE(_name, fn) OF_EARLYCON_DECLARE(_name, "", fn) extern int of_setup_earlycon(const struct earlycon_id *match, unsigned long node, const char *options); #ifdef CONFIG_SERIAL_EARLYCON extern bool earlycon_acpi_spcr_enable __initdata; int setup_earlycon(char *buf); #else static const bool earlycon_acpi_spcr_enable EARLYCON_USED_OR_UNUSED; static inline int setup_earlycon(char *buf) { return 0; } #endif struct uart_port *uart_get_console(struct uart_port *ports, int nr, struct console *c); int uart_parse_earlycon(char *p, unsigned char *iotype, resource_size_t *addr, char **options); void uart_parse_options(const char *options, int *baud, int *parity, int *bits, int *flow); int uart_set_options(struct uart_port *port, struct console *co, int baud, int parity, int bits, int flow); struct tty_driver *uart_console_device(struct console *co, int *index); void uart_console_write(struct uart_port *port, const char 
*s, unsigned int count, void (*putchar)(struct uart_port *, int)); /* * Port/driver registration/removal */ int uart_register_driver(struct uart_driver *uart); void uart_unregister_driver(struct uart_driver *uart); int uart_add_one_port(struct uart_driver *reg, struct uart_port *port); int uart_remove_one_port(struct uart_driver *reg, struct uart_port *port); int uart_match_port(struct uart_port *port1, struct uart_port *port2); /* * Power Management */ int uart_suspend_port(struct uart_driver *reg, struct uart_port *port); int uart_resume_port(struct uart_driver *reg, struct uart_port *port); #define uart_circ_empty(circ) ((circ)->head == (circ)->tail) #define uart_circ_clear(circ) ((circ)->head = (circ)->tail = 0) #define uart_circ_chars_pending(circ) \ (CIRC_CNT((circ)->head, (circ)->tail, UART_XMIT_SIZE)) #define uart_circ_chars_free(circ) \ (CIRC_SPACE((circ)->head, (circ)->tail, UART_XMIT_SIZE)) static inline int uart_tx_stopped(struct uart_port *port) { struct tty_struct *tty = port->state->port.tty; if ((tty && tty->stopped) || port->hw_stopped) return 1; return 0; } static inline bool uart_cts_enabled(struct uart_port *uport) { return !!(uport->status & UPSTAT_CTS_ENABLE); } static inline bool uart_softcts_mode(struct uart_port *uport) { upstat_t mask = UPSTAT_CTS_ENABLE | UPSTAT_AUTOCTS; return ((uport->status & mask) == UPSTAT_CTS_ENABLE); } /* * The following are helper functions for the low level drivers. 
*/ extern void uart_handle_dcd_change(struct uart_port *uport, unsigned int status); extern void uart_handle_cts_change(struct uart_port *uport, unsigned int status); extern void uart_insert_char(struct uart_port *port, unsigned int status, unsigned int overrun, unsigned int ch, unsigned int flag); #ifdef CONFIG_MAGIC_SYSRQ_SERIAL #define SYSRQ_TIMEOUT (HZ * 5) bool uart_try_toggle_sysrq(struct uart_port *port, unsigned int ch); static inline int uart_handle_sysrq_char(struct uart_port *port, unsigned int ch) { if (!port->sysrq) return 0; if (ch && time_before(jiffies, port->sysrq)) { if (sysrq_mask()) { handle_sysrq(ch); port->sysrq = 0; return 1; } if (uart_try_toggle_sysrq(port, ch)) return 1; } port->sysrq = 0; return 0; } static inline int uart_prepare_sysrq_char(struct uart_port *port, unsigned int ch) { if (!port->sysrq) return 0; if (ch && time_before(jiffies, port->sysrq)) { if (sysrq_mask()) { port->sysrq_ch = ch; port->sysrq = 0; return 1; } if (uart_try_toggle_sysrq(port, ch)) return 1; } port->sysrq = 0; return 0; } static inline void uart_unlock_and_check_sysrq(struct uart_port *port, unsigned long irqflags) { int sysrq_ch; if (!port->has_sysrq) { spin_unlock_irqrestore(&port->lock, irqflags); return; } sysrq_ch = port->sysrq_ch; port->sysrq_ch = 0; spin_unlock_irqrestore(&port->lock, irqflags); if (sysrq_ch) handle_sysrq(sysrq_ch); } #else /* CONFIG_MAGIC_SYSRQ_SERIAL */ static inline int uart_handle_sysrq_char(struct uart_port *port, unsigned int ch) { return 0; } static inline int uart_prepare_sysrq_char(struct uart_port *port, unsigned int ch) { return 0; } static inline void uart_unlock_and_check_sysrq(struct uart_port *port, unsigned long irqflags) { spin_unlock_irqrestore(&port->lock, irqflags); } #endif /* CONFIG_MAGIC_SYSRQ_SERIAL */ /* * We do the SysRQ and SAK checking like this... 
*/ static inline int uart_handle_break(struct uart_port *port) { struct uart_state *state = port->state; if (port->handle_break) port->handle_break(port); #ifdef CONFIG_MAGIC_SYSRQ_SERIAL if (port->has_sysrq && uart_console(port)) { if (!port->sysrq) { port->sysrq = jiffies + SYSRQ_TIMEOUT; return 1; } port->sysrq = 0; } #endif if (port->flags & UPF_SAK) do_SAK(state->port.tty); return 0; } /* * UART_ENABLE_MS - determine if port should enable modem status irqs */ #define UART_ENABLE_MS(port,cflag) ((port)->flags & UPF_HARDPPS_CD || \ (cflag) & CRTSCTS || \ !((cflag) & CLOCAL)) int uart_get_rs485_mode(struct uart_port *port); #endif /* LINUX_SERIAL_CORE_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TTY_H #define _LINUX_TTY_H #include <linux/fs.h> #include <linux/major.h> #include <linux/termios.h> #include <linux/workqueue.h> #include <linux/tty_driver.h> #include <linux/tty_ldisc.h> #include <linux/mutex.h> #include <linux/tty_flags.h> #include <linux/seq_file.h> #include <uapi/linux/tty.h> #include <linux/rwsem.h> #include <linux/llist.h> /* * Lock subclasses for tty locks * * TTY_LOCK_NORMAL is for normal ttys and master ptys. * TTY_LOCK_SLAVE is for slave ptys only. * * Lock subclasses are necessary for handling nested locking with pty pairs. * tty locks which use nested locking: * * legacy_mutex - Nested tty locks are necessary for releasing pty pairs. * The stable lock order is master pty first, then slave pty. * termios_rwsem - The stable lock order is tty_buffer lock->termios_rwsem. 
* Subclassing this lock enables the slave pty to hold its * termios_rwsem when claiming the master tty_buffer lock. * tty_buffer lock - slave ptys can claim nested buffer lock when handling * signal chars. The stable lock order is slave pty, then * master. */ enum { TTY_LOCK_NORMAL = 0, TTY_LOCK_SLAVE, }; /* * (Note: the *_driver.minor_start values 1, 64, 128, 192 are * hardcoded at present.) */ #define NR_UNIX98_PTY_DEFAULT 4096 /* Default maximum for Unix98 ptys */ #define NR_UNIX98_PTY_RESERVE 1024 /* Default reserve for main devpts */ #define NR_UNIX98_PTY_MAX (1 << MINORBITS) /* Absolute limit */ /* * This character is the same as _POSIX_VDISABLE: it cannot be used as * a c_cc[] character, but indicates that a particular special character * isn't in use (eg VINTR has no character etc) */ #define __DISABLED_CHAR '\0' struct tty_buffer { union { struct tty_buffer *next; struct llist_node free; }; int used; int size; int commit; int read; int flags; /* Data points here */ unsigned long data[]; }; /* Values for .flags field of tty_buffer */ #define TTYB_NORMAL 1 /* buffer has no flags buffer */ static inline unsigned char *char_buf_ptr(struct tty_buffer *b, int ofs) { return ((unsigned char *)b->data) + ofs; } static inline char *flag_buf_ptr(struct tty_buffer *b, int ofs) { return (char *)char_buf_ptr(b, ofs) + b->size; } struct tty_bufhead { struct tty_buffer *head; /* Queue head */ struct work_struct work; struct mutex lock; atomic_t priority; struct tty_buffer sentinel; struct llist_head free; /* Free queue head */ atomic_t mem_used; /* In-use buffers excluding free list */ int mem_limit; struct tty_buffer *tail; /* Active buffer */ }; /* * When a break, frame error, or parity error happens, these codes are * stuffed into the flags buffer. 
*/
#define TTY_NORMAL	0
#define TTY_BREAK	1
#define TTY_FRAME	2
#define TTY_PARITY	3
#define TTY_OVERRUN	4

/* Accessors for the individual special characters in tty->termios.c_cc[]. */
#define INTR_CHAR(tty) ((tty)->termios.c_cc[VINTR])
#define QUIT_CHAR(tty) ((tty)->termios.c_cc[VQUIT])
#define ERASE_CHAR(tty) ((tty)->termios.c_cc[VERASE])
#define KILL_CHAR(tty) ((tty)->termios.c_cc[VKILL])
#define EOF_CHAR(tty) ((tty)->termios.c_cc[VEOF])
#define TIME_CHAR(tty) ((tty)->termios.c_cc[VTIME])
#define MIN_CHAR(tty) ((tty)->termios.c_cc[VMIN])
#define SWTC_CHAR(tty) ((tty)->termios.c_cc[VSWTC])
#define START_CHAR(tty) ((tty)->termios.c_cc[VSTART])
#define STOP_CHAR(tty) ((tty)->termios.c_cc[VSTOP])
#define SUSP_CHAR(tty) ((tty)->termios.c_cc[VSUSP])
#define EOL_CHAR(tty) ((tty)->termios.c_cc[VEOL])
#define REPRINT_CHAR(tty) ((tty)->termios.c_cc[VREPRINT])
#define DISCARD_CHAR(tty) ((tty)->termios.c_cc[VDISCARD])
#define WERASE_CHAR(tty) ((tty)->termios.c_cc[VWERASE])
#define LNEXT_CHAR(tty)	((tty)->termios.c_cc[VLNEXT])
#define EOL2_CHAR(tty) ((tty)->termios.c_cc[VEOL2])

/*
 * Mask tty->termios.c_iflag / c_oflag / c_cflag / c_lflag with the bits in
 * f. The result is the masked value, not a normalized 0/1.
 */
#define _I_FLAG(tty, f)	((tty)->termios.c_iflag & (f))
#define _O_FLAG(tty, f)	((tty)->termios.c_oflag & (f))
#define _C_FLAG(tty, f)	((tty)->termios.c_cflag & (f))
#define _L_FLAG(tty, f)	((tty)->termios.c_lflag & (f))

/* c_iflag tests */
#define I_IGNBRK(tty)	_I_FLAG((tty), IGNBRK)
#define I_BRKINT(tty)	_I_FLAG((tty), BRKINT)
#define I_IGNPAR(tty)	_I_FLAG((tty), IGNPAR)
#define I_PARMRK(tty)	_I_FLAG((tty), PARMRK)
#define I_INPCK(tty)	_I_FLAG((tty), INPCK)
#define I_ISTRIP(tty)	_I_FLAG((tty), ISTRIP)
#define I_INLCR(tty)	_I_FLAG((tty), INLCR)
#define I_IGNCR(tty)	_I_FLAG((tty), IGNCR)
#define I_ICRNL(tty)	_I_FLAG((tty), ICRNL)
#define I_IUCLC(tty)	_I_FLAG((tty), IUCLC)
#define I_IXON(tty)	_I_FLAG((tty), IXON)
#define I_IXANY(tty)	_I_FLAG((tty), IXANY)
#define I_IXOFF(tty)	_I_FLAG((tty), IXOFF)
#define I_IMAXBEL(tty)	_I_FLAG((tty), IMAXBEL)
#define I_IUTF8(tty)	_I_FLAG((tty), IUTF8)

/* c_oflag tests */
#define O_OPOST(tty)	_O_FLAG((tty), OPOST)
#define O_OLCUC(tty)	_O_FLAG((tty), OLCUC)
#define O_ONLCR(tty)	_O_FLAG((tty), ONLCR)
#define O_OCRNL(tty)	_O_FLAG((tty), OCRNL)
#define O_ONOCR(tty)	_O_FLAG((tty), ONOCR)
#define O_ONLRET(tty)	_O_FLAG((tty), ONLRET)
#define O_OFILL(tty)	_O_FLAG((tty), OFILL)
#define O_OFDEL(tty)	_O_FLAG((tty), OFDEL)
#define O_NLDLY(tty)	_O_FLAG((tty), NLDLY)
#define O_CRDLY(tty)	_O_FLAG((tty), CRDLY)
#define O_TABDLY(tty)	_O_FLAG((tty), TABDLY)
#define O_BSDLY(tty)	_O_FLAG((tty), BSDLY)
#define O_VTDLY(tty)	_O_FLAG((tty), VTDLY)
#define O_FFDLY(tty)	_O_FLAG((tty), FFDLY)

/* c_cflag tests */
#define C_BAUD(tty)	_C_FLAG((tty), CBAUD)
#define C_CSIZE(tty)	_C_FLAG((tty), CSIZE)
#define C_CSTOPB(tty)	_C_FLAG((tty), CSTOPB)
#define C_CREAD(tty)	_C_FLAG((tty), CREAD)
#define C_PARENB(tty)	_C_FLAG((tty), PARENB)
#define C_PARODD(tty)	_C_FLAG((tty), PARODD)
#define C_HUPCL(tty)	_C_FLAG((tty), HUPCL)
#define C_CLOCAL(tty)	_C_FLAG((tty), CLOCAL)
#define C_CIBAUD(tty)	_C_FLAG((tty), CIBAUD)
#define C_CRTSCTS(tty)	_C_FLAG((tty), CRTSCTS)
#define C_CMSPAR(tty)	_C_FLAG((tty), CMSPAR)

/* c_lflag tests */
#define L_ISIG(tty)	_L_FLAG((tty), ISIG)
#define L_ICANON(tty)	_L_FLAG((tty), ICANON)
#define L_XCASE(tty)	_L_FLAG((tty), XCASE)
#define L_ECHO(tty)	_L_FLAG((tty), ECHO)
#define L_ECHOE(tty)	_L_FLAG((tty), ECHOE)
#define L_ECHOK(tty)	_L_FLAG((tty), ECHOK)
#define L_ECHONL(tty)	_L_FLAG((tty), ECHONL)
#define L_NOFLSH(tty)	_L_FLAG((tty), NOFLSH)
#define L_TOSTOP(tty)	_L_FLAG((tty), TOSTOP)
#define L_ECHOCTL(tty)	_L_FLAG((tty), ECHOCTL)
#define L_ECHOPRT(tty)	_L_FLAG((tty), ECHOPRT)
#define L_ECHOKE(tty)	_L_FLAG((tty), ECHOKE)
#define L_FLUSHO(tty)	_L_FLAG((tty), FLUSHO)
#define L_PENDIN(tty)	_L_FLAG((tty), PENDIN)
#define L_IEXTEN(tty)	_L_FLAG((tty), IEXTEN)
#define L_EXTPROC(tty)	_L_FLAG((tty), EXTPROC)

struct device;
struct signal_struct;

/*
 * Port level information. Each device keeps its own port level information
 * so provide a common structure for those ports wanting to use common support
 * routines.
 *
 * The tty port has a different lifetime to the tty so must be kept apart.
* In addition be careful as tty -> port mappings are valid for the life
 * of the tty object but in many cases port -> tty mappings are valid only
 * until a hangup so don't use the wrong path.
 */

struct tty_port;

/* Callbacks a driver supplies for a tty_port (referenced by tty_port::ops). */
struct tty_port_operations {
	/* Return 1 if the carrier is raised */
	int (*carrier_raised)(struct tty_port *port);
	/* Control the DTR line */
	void (*dtr_rts)(struct tty_port *port, int raise);
	/* Called when the last close completes or a hangup finishes
	   IFF the port was initialized. Do not use to free resources. Called
	   under the port mutex to serialize against activate/shutdowns */
	void (*shutdown)(struct tty_port *port);
	/* Called under the port mutex from tty_port_open, serialized using
	   the port mutex */
        /* FIXME: long term getting the tty argument *out* of this would be
           good for consoles */
	int (*activate)(struct tty_port *port, struct tty_struct *tty);
	/* Called on the final put of a port */
	void (*destruct)(struct tty_port *port);
};

/* Callbacks supplied by a port client (referenced by tty_port::client_ops). */
struct tty_port_client_operations {
	int (*receive_buf)(struct tty_port *port, const unsigned char *, const unsigned char *, size_t);
	void (*write_wakeup)(struct tty_port *port);
};

extern const struct tty_port_client_operations tty_port_default_client_ops;

struct tty_port {
	struct tty_bufhead	buf;		/* Locked internally */
	struct tty_struct	*tty;		/* Back pointer */
	struct tty_struct	*itty;		/* internal back ptr */
	const struct tty_port_operations *ops;	/* Port operations */
	const struct tty_port_client_operations *client_ops; /* Port client operations */
	spinlock_t		lock;		/* Lock protecting tty field */
	int			blocked_open;	/* Waiting to open */
	int			count;		/* Usage count */
	wait_queue_head_t	open_wait;	/* Open waiters */
	wait_queue_head_t	delta_msr_wait;	/* Modem status change */
	unsigned long		flags;		/* User TTY flags ASYNC_ */
	unsigned long		iflags;		/* Internal flags TTY_PORT_ */
	unsigned char		console:1,	/* port is a console */
				low_latency:1;	/* optional: tune for latency */
	struct mutex		mutex;		/* Locking */
	struct mutex		buf_mutex;	/* Buffer alloc lock */
	unsigned char		*xmit_buf;	/* Optional buffer */
	unsigned int		close_delay;	/* Close port delay */
	unsigned int		closing_wait;	/* Delay for output */
	int			drain_delay;	/* Set to zero if no pure time
						   based drain is needed else
						   set to size of fifo */
	struct kref		kref;		/* Ref counter */
	void 			*client_data;
};

/* tty_port::iflags bits -- use atomic bit ops */
#define TTY_PORT_INITIALIZED	0	/* device is initialized */
#define TTY_PORT_SUSPENDED	1	/* device is suspended */
#define TTY_PORT_ACTIVE		2	/* device is open */

/*
 * uart drivers: use the uart_port::status field and the UPSTAT_* defines
 * for s/w-based flow control steering and carrier detection status
 */
#define TTY_PORT_CTS_FLOW	3	/* h/w flow control enabled */
#define TTY_PORT_CHECK_CD	4	/* carrier detect enabled */
#define TTY_PORT_KOPENED	5	/* device exclusively opened by
					   kernel */

/*
 * Where all of the state associated with a tty is kept while the tty
 * is open.  Since the termios state should be kept even if the tty
 * has been closed --- for things like the baud rate, etc --- it is
 * not stored here, but rather a pointer to the real state is stored
 * here.  Possible the winsize structure should have the same
 * treatment, but (1) the default 80x24 is usually right and (2) it's
 * most often used by a windowing system, which will set the correct
 * size each time the window is created or resized anyway.
* 						- TYT, 9/14/92
 */

struct tty_operations;

struct tty_struct {
	int	magic;
	struct kref kref;
	struct device *dev;
	struct tty_driver *driver;
	const struct tty_operations *ops;
	int index;

	/* Protects ldisc changes: Lock tty not pty */
	struct ld_semaphore ldisc_sem;
	struct tty_ldisc *ldisc;

	struct mutex atomic_write_lock;
	struct mutex legacy_mutex;
	struct mutex throttle_mutex;
	struct rw_semaphore termios_rwsem;
	struct mutex winsize_mutex;
	spinlock_t ctrl_lock;
	spinlock_t flow_lock;
	/* Termios values are protected by the termios rwsem */
	struct ktermios termios, termios_locked;
	char name[64];
	struct pid *pgrp;		/* Protected by ctrl lock */
	/*
	 * Writes protected by both ctrl lock and legacy mutex, readers must use
	 * at least one of them.
	 */
	struct pid *session;
	unsigned long flags;
	int count;
	struct winsize winsize;		/* winsize_mutex */
	unsigned long stopped:1,	/* flow_lock */
		      flow_stopped:1,
		      unused:BITS_PER_LONG - 2;
	int hw_stopped;
	unsigned long ctrl_status:8,	/* ctrl_lock */
		      packet:1,
		      unused_ctrl:BITS_PER_LONG - 9;
	unsigned int receive_room;	/* Bytes free for queue */
	int flow_change;

	struct tty_struct *link;
	struct fasync_struct *fasync;
	wait_queue_head_t write_wait;
	wait_queue_head_t read_wait;
	struct work_struct hangup_work;
	void *disc_data;
	void *driver_data;
	spinlock_t files_lock;		/* protects tty_files list */
	struct list_head tty_files;

#define N_TTY_BUF_SIZE 4096

	int closing;
	unsigned char *write_buf;
	int write_cnt;
	/* If the tty has a pending do_SAK, queue it here - akpm */
	struct work_struct SAK_work;
	struct tty_port *port;
} __randomize_layout;

/* Each of a tty's open files has private_data pointing to tty_file_private */
struct tty_file_private {
	struct tty_struct *tty;
	struct file *file;
	struct list_head list;
};

/* tty magic number */
#define TTY_MAGIC		0x5401

/*
 * These bits are used in the flags field of the tty structure.
* * So that interrupts won't be able to mess up the queues, * copy_to_cooked must be atomic with respect to itself, as must * tty->write. Thus, you must use the inline functions set_bit() and * clear_bit() to make things atomic. */ #define TTY_THROTTLED 0 /* Call unthrottle() at threshold min */ #define TTY_IO_ERROR 1 /* Cause an I/O error (may be no ldisc too) */ #define TTY_OTHER_CLOSED 2 /* Other side (if any) has closed */ #define TTY_EXCLUSIVE 3 /* Exclusive open mode */ #define TTY_DO_WRITE_WAKEUP 5 /* Call write_wakeup after queuing new */ #define TTY_LDISC_OPEN 11 /* Line discipline is open */ #define TTY_PTY_LOCK 16 /* pty private */ #define TTY_NO_WRITE_SPLIT 17 /* Preserve write boundaries to driver */ #define TTY_HUPPED 18 /* Post driver->hangup() */ #define TTY_HUPPING 19 /* Hangup in progress */ #define TTY_LDISC_CHANGING 20 /* Change pending - non-block IO */ #define TTY_LDISC_HALTED 22 /* Line discipline is halted */ /* Values for tty->flow_change */ #define TTY_THROTTLE_SAFE 1 #define TTY_UNTHROTTLE_SAFE 2 static inline void __tty_set_flow_change(struct tty_struct *tty, int val) { tty->flow_change = val; } static inline void tty_set_flow_change(struct tty_struct *tty, int val) { tty->flow_change = val; smp_mb(); } static inline bool tty_io_nonblock(struct tty_struct *tty, struct file *file) { return file->f_flags & O_NONBLOCK || test_bit(TTY_LDISC_CHANGING, &tty->flags); } static inline bool tty_io_error(struct tty_struct *tty) { return test_bit(TTY_IO_ERROR, &tty->flags); } static inline bool tty_throttled(struct tty_struct *tty) { return test_bit(TTY_THROTTLED, &tty->flags); } #ifdef CONFIG_TTY extern void tty_kref_put(struct tty_struct *tty); extern struct pid *tty_get_pgrp(struct tty_struct *tty); extern void tty_vhangup_self(void); extern void disassociate_ctty(int priv); extern dev_t tty_devnum(struct tty_struct *tty); extern void proc_clear_tty(struct task_struct *p); extern struct tty_struct *get_current_tty(void); /* tty_io.c */ extern int 
__init tty_init(void); extern const char *tty_name(const struct tty_struct *tty); extern struct tty_struct *tty_kopen(dev_t device); extern void tty_kclose(struct tty_struct *tty); extern int tty_dev_name_to_number(const char *name, dev_t *number); extern int tty_ldisc_lock(struct tty_struct *tty, unsigned long timeout); extern void tty_ldisc_unlock(struct tty_struct *tty); extern ssize_t redirected_tty_write(struct kiocb *, struct iov_iter *); #else static inline void tty_kref_put(struct tty_struct *tty) { } static inline struct pid *tty_get_pgrp(struct tty_struct *tty) { return NULL; } static inline void tty_vhangup_self(void) { } static inline void disassociate_ctty(int priv) { } static inline dev_t tty_devnum(struct tty_struct *tty) { return 0; } static inline void proc_clear_tty(struct task_struct *p) { } static inline struct tty_struct *get_current_tty(void) { return NULL; } /* tty_io.c */ static inline int __init tty_init(void) { return 0; } static inline const char *tty_name(const struct tty_struct *tty) { return "(none)"; } static inline struct tty_struct *tty_kopen(dev_t device) { return ERR_PTR(-ENODEV); } static inline void tty_kclose(struct tty_struct *tty) { } static inline int tty_dev_name_to_number(const char *name, dev_t *number) { return -ENOTSUPP; } #endif extern struct ktermios tty_std_termios; extern int vcs_init(void); extern struct class *tty_class; /** * tty_kref_get - get a tty reference * @tty: tty device * * Return a new reference to a tty object. 
The caller must hold * sufficient locks/counts to ensure that their existing reference cannot * go away */ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty) { if (tty) kref_get(&tty->kref); return tty; } extern const char *tty_driver_name(const struct tty_struct *tty); extern void tty_wait_until_sent(struct tty_struct *tty, long timeout); extern int __tty_check_change(struct tty_struct *tty, int sig); extern int tty_check_change(struct tty_struct *tty); extern void __stop_tty(struct tty_struct *tty); extern void stop_tty(struct tty_struct *tty); extern void __start_tty(struct tty_struct *tty); extern void start_tty(struct tty_struct *tty); extern int tty_register_driver(struct tty_driver *driver); extern int tty_unregister_driver(struct tty_driver *driver); extern struct device *tty_register_device(struct tty_driver *driver, unsigned index, struct device *dev); extern struct device *tty_register_device_attr(struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp); extern void tty_unregister_device(struct tty_driver *driver, unsigned index); extern void tty_write_message(struct tty_struct *tty, char *msg); extern int tty_send_xchar(struct tty_struct *tty, char ch); extern int tty_put_char(struct tty_struct *tty, unsigned char c); extern int tty_chars_in_buffer(struct tty_struct *tty); extern int tty_write_room(struct tty_struct *tty); extern void tty_driver_flush_buffer(struct tty_struct *tty); extern void tty_throttle(struct tty_struct *tty); extern void tty_unthrottle(struct tty_struct *tty); extern int tty_throttle_safe(struct tty_struct *tty); extern int tty_unthrottle_safe(struct tty_struct *tty); extern int tty_do_resize(struct tty_struct *tty, struct winsize *ws); extern int is_current_pgrp_orphaned(void); extern void tty_hangup(struct tty_struct *tty); extern void tty_vhangup(struct tty_struct *tty); extern void tty_vhangup_session(struct tty_struct *tty); extern int 
tty_hung_up_p(struct file *filp);
extern void do_SAK(struct tty_struct *tty);
extern void __do_SAK(struct tty_struct *tty);
extern void tty_open_proc_set_tty(struct file *filp, struct tty_struct *tty);
extern int tty_signal_session_leader(struct tty_struct *tty, int exit_session);
extern void session_clear_tty(struct pid *session);
extern void no_tty(void);
extern void tty_buffer_free_all(struct tty_port *port);
extern void tty_buffer_flush(struct tty_struct *tty, struct tty_ldisc *ld);
extern void tty_buffer_init(struct tty_port *port);
extern void tty_buffer_set_lock_subclass(struct tty_port *port);
extern bool tty_buffer_restart_work(struct tty_port *port);
extern bool tty_buffer_cancel_work(struct tty_port *port);
extern void tty_buffer_flush_work(struct tty_port *port);
extern speed_t tty_termios_baud_rate(struct ktermios *termios);
extern speed_t tty_termios_input_baud_rate(struct ktermios *termios);
extern void tty_termios_encode_baud_rate(struct ktermios *termios,
						speed_t ibaud, speed_t obaud);
extern void tty_encode_baud_rate(struct tty_struct *tty,
						speed_t ibaud, speed_t obaud);

/**
 *	tty_get_baud_rate	-	get tty bit rates
 *	@tty: tty to query
 *
 *	Returns the baud rate as an integer for this terminal by passing
 *	@tty's termios to tty_termios_baud_rate(). The termios lock must be
 *	held by the caller and the terminal bit flags may be updated.
 *
 *	Locking: none taken by this helper; see the caller requirement above.
 */
static inline speed_t tty_get_baud_rate(struct tty_struct *tty)
{
	return tty_termios_baud_rate(&tty->termios);
}

extern void tty_termios_copy_hw(struct ktermios *new, struct ktermios *old);
extern int tty_termios_hw_change(const struct ktermios *a, const struct ktermios *b);
extern int tty_set_termios(struct tty_struct *tty, struct ktermios *kt);

extern struct tty_ldisc *tty_ldisc_ref(struct tty_struct *);
extern void tty_ldisc_deref(struct tty_ldisc *);
extern struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *);
extern void tty_ldisc_hangup(struct tty_struct *tty, bool reset);
extern int tty_ldisc_reinit(struct tty_struct *tty, int disc);
extern const struct seq_operations tty_ldiscs_seq_ops;

extern void tty_wakeup(struct tty_struct *tty);
extern void tty_ldisc_flush(struct tty_struct *tty);

extern long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
extern int tty_mode_ioctl(struct tty_struct *tty, struct file *file,
			unsigned int cmd, unsigned long arg);
extern long tty_jobctrl_ioctl(struct tty_struct *tty, struct tty_struct *real_tty,
			      struct file *file, unsigned int cmd, unsigned long arg);
extern int tty_perform_flush(struct tty_struct *tty, unsigned long arg);
extern void tty_default_fops(struct file_operations *fops);
extern struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx);
extern int tty_alloc_file(struct file *file);
extern void tty_add_file(struct tty_struct *tty, struct file *file);
extern void tty_free_file(struct file *file);
extern struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx);
extern void tty_release_struct(struct tty_struct *tty, int idx);
extern int tty_release(struct inode *inode, struct file *filp);
extern void tty_init_termios(struct tty_struct *tty);
extern void tty_save_termios(struct tty_struct *tty);
extern int tty_standard_install(struct tty_driver *driver,
		struct tty_struct *tty);

extern struct mutex tty_mutex;

/*
 * True while @tty's atomic_write_lock mutex is held by anyone.
 * NOTE(review): @tty is not parenthesized in the expansion, so pass a plain
 * identifier rather than an expression.
 */
#define tty_is_writelocked(tty)  (mutex_is_locked(&tty->atomic_write_lock))

extern void tty_port_init(struct tty_port *port);
extern void tty_port_link_device(struct tty_port *port,
		struct tty_driver *driver, unsigned index);
extern struct device *tty_port_register_device(struct tty_port *port,
		struct tty_driver *driver, unsigned index,
		struct device *device);
extern struct device *tty_port_register_device_attr(struct tty_port *port,
		struct tty_driver *driver, unsigned index,
		struct device *device, void *drvdata,
		const struct attribute_group **attr_grp);
extern struct device *tty_port_register_device_serdev(struct tty_port *port,
		struct tty_driver *driver, unsigned index,
		struct device *device);
extern struct device *tty_port_register_device_attr_serdev(struct tty_port *port,
		struct tty_driver *driver, unsigned index,
		struct device *device, void *drvdata,
		const struct attribute_group **attr_grp);
extern void tty_port_unregister_device(struct tty_port *port,
		struct tty_driver *driver, unsigned index);
extern int tty_port_alloc_xmit_buf(struct tty_port *port);
extern void tty_port_free_xmit_buf(struct tty_port *port);
extern void tty_port_destroy(struct tty_port *port);
extern void tty_port_put(struct tty_port *port);

/*
 * Try to take a reference on @port. Returns @port when the reference was
 * obtained, or NULL when @port is NULL or kref_get_unless_zero() fails
 * (i.e. the refcount had already reached zero).
 */
static inline struct tty_port *tty_port_get(struct tty_port *port)
{
	if (port && kref_get_unless_zero(&port->kref))
		return port;
	return NULL;
}

/* If the cts flow control is enabled, return true.
*/ static inline bool tty_port_cts_enabled(struct tty_port *port) { return test_bit(TTY_PORT_CTS_FLOW, &port->iflags); } static inline void tty_port_set_cts_flow(struct tty_port *port, bool val) { if (val) set_bit(TTY_PORT_CTS_FLOW, &port->iflags); else clear_bit(TTY_PORT_CTS_FLOW, &port->iflags); } static inline bool tty_port_active(struct tty_port *port) { return test_bit(TTY_PORT_ACTIVE, &port->iflags); } static inline void tty_port_set_active(struct tty_port *port, bool val) { if (val) set_bit(TTY_PORT_ACTIVE, &port->iflags); else clear_bit(TTY_PORT_ACTIVE, &port->iflags); } static inline bool tty_port_check_carrier(struct tty_port *port) { return test_bit(TTY_PORT_CHECK_CD, &port->iflags); } static inline void tty_port_set_check_carrier(struct tty_port *port, bool val) { if (val) set_bit(TTY_PORT_CHECK_CD, &port->iflags); else clear_bit(TTY_PORT_CHECK_CD, &port->iflags); } static inline bool tty_port_suspended(struct tty_port *port) { return test_bit(TTY_PORT_SUSPENDED, &port->iflags); } static inline void tty_port_set_suspended(struct tty_port *port, bool val) { if (val) set_bit(TTY_PORT_SUSPENDED, &port->iflags); else clear_bit(TTY_PORT_SUSPENDED, &port->iflags); } static inline bool tty_port_initialized(struct tty_port *port) { return test_bit(TTY_PORT_INITIALIZED, &port->iflags); } static inline void tty_port_set_initialized(struct tty_port *port, bool val) { if (val) set_bit(TTY_PORT_INITIALIZED, &port->iflags); else clear_bit(TTY_PORT_INITIALIZED, &port->iflags); } static inline bool tty_port_kopened(struct tty_port *port) { return test_bit(TTY_PORT_KOPENED, &port->iflags); } static inline void tty_port_set_kopened(struct tty_port *port, bool val) { if (val) set_bit(TTY_PORT_KOPENED, &port->iflags); else clear_bit(TTY_PORT_KOPENED, &port->iflags); } extern struct tty_struct *tty_port_tty_get(struct tty_port *port); extern void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty); extern int tty_port_carrier_raised(struct tty_port *port); 
extern void tty_port_raise_dtr_rts(struct tty_port *port);
extern void tty_port_lower_dtr_rts(struct tty_port *port);
extern void tty_port_hangup(struct tty_port *port);
extern void tty_port_tty_hangup(struct tty_port *port, bool check_clocal);
extern void tty_port_tty_wakeup(struct tty_port *port);
extern int tty_port_block_til_ready(struct tty_port *port,
				struct tty_struct *tty, struct file *filp);
extern int tty_port_close_start(struct tty_port *port,
				struct tty_struct *tty, struct file *filp);
extern void tty_port_close_end(struct tty_port *port, struct tty_struct *tty);
extern void tty_port_close(struct tty_port *port,
				struct tty_struct *tty, struct file *filp);
extern int tty_port_install(struct tty_port *port, struct tty_driver *driver,
				struct tty_struct *tty);
extern int tty_port_open(struct tty_port *port,
				struct tty_struct *tty, struct file *filp);

/* Return the sum of port->count and port->blocked_open. */
static inline int tty_port_users(struct tty_port *port)
{
	return port->count + port->blocked_open;
}

extern int tty_register_ldisc(int disc, struct tty_ldisc_ops *new_ldisc);
extern int tty_unregister_ldisc(int disc);
extern int tty_set_ldisc(struct tty_struct *tty, int disc);
extern int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty);
extern void tty_ldisc_release(struct tty_struct *tty);
extern int __must_check tty_ldisc_init(struct tty_struct *tty);
extern void tty_ldisc_deinit(struct tty_struct *tty);
extern int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p,
				 char *f, int count);

/* n_tty.c */
extern void n_tty_inherit_ops(struct tty_ldisc_ops *ops);
#ifdef CONFIG_TTY
extern void __init n_tty_init(void);
#else
/* Without CONFIG_TTY the initializer is an empty stub. */
static inline void n_tty_init(void) { }
#endif

/* tty_audit.c */
#ifdef CONFIG_AUDIT
extern void tty_audit_add_data(struct tty_struct *tty, const void *data,
			       size_t size);
extern void tty_audit_exit(void);
extern void tty_audit_fork(struct signal_struct *sig);
extern void tty_audit_tiocsti(struct tty_struct *tty, char ch);
extern int tty_audit_push(void);
#else
/* Without CONFIG_AUDIT the audit hooks are no-ops; tty_audit_push() returns 0. */
static inline void tty_audit_add_data(struct tty_struct *tty, const void *data,
				      size_t size)
{
}
static inline void tty_audit_tiocsti(struct tty_struct *tty, char ch)
{
}
static inline void tty_audit_exit(void)
{
}
static inline void tty_audit_fork(struct signal_struct *sig)
{
}
static inline int tty_audit_push(void)
{
	return 0;
}
#endif

/* tty_ioctl.c */
extern int n_tty_ioctl_helper(struct tty_struct *tty, struct file *file,
		       unsigned int cmd, unsigned long arg);

/* vt.c */

extern int vt_ioctl(struct tty_struct *tty,
		    unsigned int cmd, unsigned long arg);

extern long vt_compat_ioctl(struct tty_struct *tty,
		     unsigned int cmd, unsigned long arg);

/* tty_mutex.c */
/* functions for preparation of BKL removal */
extern void tty_lock(struct tty_struct *tty);
extern int  tty_lock_interruptible(struct tty_struct *tty);
extern void tty_unlock(struct tty_struct *tty);
extern void tty_lock_slave(struct tty_struct *tty);
extern void tty_unlock_slave(struct tty_struct *tty);
extern void tty_set_lock_subclass(struct tty_struct *tty);

#ifdef CONFIG_PROC_FS
extern void proc_tty_register_driver(struct tty_driver *);
extern void proc_tty_unregister_driver(struct tty_driver *);
#else
/* Without CONFIG_PROC_FS driver (un)registration in procfs does nothing. */
static inline void proc_tty_register_driver(struct tty_driver *d) {}
static inline void proc_tty_unregister_driver(struct tty_driver *d) {}
#endif

/*
 * Call the printing function @fn with format @f prefixed by "%s %s: ", where
 * the two strings are tty_driver_name(@tty) and tty_name(@tty). @f must be a
 * string literal because it is pasted onto the prefix at compile time.
 */
#define tty_msg(fn, tty, f, ...) \
	fn("%s %s: " f, tty_driver_name(tty), tty_name(tty), ##__VA_ARGS__)

/* Per-level wrappers: each forwards to tty_msg() with the matching pr_*(). */
#define tty_debug(tty, f, ...)	tty_msg(pr_debug, tty, f, ##__VA_ARGS__)
#define tty_info(tty, f, ...)	tty_msg(pr_info, tty, f, ##__VA_ARGS__)
#define tty_notice(tty, f, ...)	tty_msg(pr_notice, tty, f, ##__VA_ARGS__)
#define tty_warn(tty, f, ...)	tty_msg(pr_warn, tty, f, ##__VA_ARGS__)
#define tty_err(tty, f, ...)	tty_msg(pr_err, tty, f, ##__VA_ARGS__)

#define tty_info_ratelimited(tty, f, ...) \
		tty_msg(pr_info_ratelimited, tty, f, ##__VA_ARGS__)

/* Closes a conditional opened above this view (presumably the include guard). */
#endif
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 
508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 
1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 
1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 
1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 
2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 
2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 // SPDX-License-Identifier: GPL-2.0-only /* * mm/page-writeback.c * * Copyright (C) 2002, Linus Torvalds. * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Contains functions related to writing back dirty pages at the * address_space level. 
*
 * 10Apr2002	Andrew Morton
 *		Initial version
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/init.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
#include <linux/pagevec.h>
#include <linux/timer.h>
#include <linux/sched/rt.h>
#include <linux/sched/signal.h>
#include <linux/mm_inline.h>
#include <trace/events/writeback.h>

#include "internal.h"

/*
 * Sleep at most 200ms at a time in balance_dirty_pages().
 * The max() keeps the value at one jiffy or more when HZ/5 rounds down to 0.
 */
#define MAX_PAUSE		max(HZ/5, 1)

/*
 * Try to keep balance_dirty_pages() call intervals higher than this many pages
 * by raising pause time to max_pause when falls below it.
 * The 128 is in KB; shifting by (PAGE_SHIFT - 10) converts it to pages.
 */
#define DIRTY_POLL_THRESH	(128 >> (PAGE_SHIFT - 10))

/*
 * Estimate write bandwidth at 200ms intervals.
 */
#define BANDWIDTH_INTERVAL	max(HZ/5, 1)

/* NOTE(review): not used in the visible part of the file; meaning to be confirmed. */
#define RATELIMIT_CALC_SHIFT	10

/*
 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
 * will look to see if it needs to force writeback or throttling.
 */
static long ratelimit_pages = 32;

/* The following parameters are exported via /proc/sys/vm */

/*
 * Start background writeback (via writeback threads) at this percentage
 */
int dirty_background_ratio = 10;

/*
 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
 * dirty_background_ratio * the amount of dirtyable memory
 */
unsigned long dirty_background_bytes;

/*
 * free highmem will not be subtracted from the total free memory
 * for calculating free ratios if vm_highmem_is_dirtyable is true
 */
int vm_highmem_is_dirtyable;

/*
 * The generator of dirty data starts writeback at this percentage
 */
int vm_dirty_ratio = 20;

/*
 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
 * vm_dirty_ratio * the amount of dirtyable memory
 */
unsigned long vm_dirty_bytes;

/*
 * The interval between `kupdate'-style writebacks
 */
unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */

EXPORT_SYMBOL_GPL(dirty_writeback_interval);

/*
 * The longest time for which data is allowed to remain dirty
 */
unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */

/*
 * Flag that makes the machine dump writes/reads and block dirtyings.
 */
int block_dump;

/*
 * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
 * a full sync is triggered after this time elapses without any disk activity.
*/ int laptop_mode; EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ struct wb_domain global_wb_domain; /* consolidated parameters for balance_dirty_pages() and its subroutines */ struct dirty_throttle_control { #ifdef CONFIG_CGROUP_WRITEBACK struct wb_domain *dom; struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */ #endif struct bdi_writeback *wb; struct fprop_local_percpu *wb_completions; unsigned long avail; /* dirtyable */ unsigned long dirty; /* file_dirty + write + nfs */ unsigned long thresh; /* dirty threshold */ unsigned long bg_thresh; /* dirty background threshold */ unsigned long wb_dirty; /* per-wb counterparts */ unsigned long wb_thresh; unsigned long wb_bg_thresh; unsigned long pos_ratio; }; /* * Length of period for aging writeout fractions of bdis. This is an * arbitrarily chosen number. The longer the period, the slower fractions will * reflect changes in current writeout rate. */ #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) #ifdef CONFIG_CGROUP_WRITEBACK #define GDTC_INIT(__wb) .wb = (__wb), \ .dom = &global_wb_domain, \ .wb_completions = &(__wb)->completions #define GDTC_INIT_NO_WB .dom = &global_wb_domain #define MDTC_INIT(__wb, __gdtc) .wb = (__wb), \ .dom = mem_cgroup_wb_domain(__wb), \ .wb_completions = &(__wb)->memcg_completions, \ .gdtc = __gdtc static bool mdtc_valid(struct dirty_throttle_control *dtc) { return dtc->dom; } static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) { return dtc->dom; } static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) { return mdtc->gdtc; } static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) { return &wb->memcg_completions; } static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { unsigned long this_bw = wb->avg_write_bandwidth; unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); unsigned long long min = wb->bdi->min_ratio; unsigned long long 
max = wb->bdi->max_ratio; /* * @wb may already be clean by the time control reaches here and * the total may not include its bw. */ if (this_bw < tot_bw) { if (min) { min *= this_bw; min = div64_ul(min, tot_bw); } if (max < 100) { max *= this_bw; max = div64_ul(max, tot_bw); } } *minp = min; *maxp = max; } #else /* CONFIG_CGROUP_WRITEBACK */ #define GDTC_INIT(__wb) .wb = (__wb), \ .wb_completions = &(__wb)->completions #define GDTC_INIT_NO_WB #define MDTC_INIT(__wb, __gdtc) static bool mdtc_valid(struct dirty_throttle_control *dtc) { return false; } static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) { return &global_wb_domain; } static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) { return NULL; } static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) { return NULL; } static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { *minp = wb->bdi->min_ratio; *maxp = wb->bdi->max_ratio; } #endif /* CONFIG_CGROUP_WRITEBACK */ /* * In a memory zone, there is a certain amount of pages we consider * available for the page cache, which is essentially the number of * free and reclaimable pages, minus some zone reserves to protect * lowmem and the ability to uphold the zone's watermarks without * requiring writeback. * * This number of dirtyable pages is the base value of which the * user-configurable dirty ratio is the effective number of pages that * are allowed to be actually dirtied. Per individual zone, or * globally by using the sum of dirtyable pages over all zones. * * Because the user is allowed to specify the dirty limit globally as * absolute number of bytes, calculating the per-zone dirty limit can * require translating the configured limit into a percentage of * global dirtyable memory first. 
*/

/**
 * node_dirtyable_memory - number of dirtyable pages in a node
 * @pgdat: the node
 *
 * Sums NR_FREE_PAGES over the node's populated zones, subtracts the node's
 * totalreserve_pages (clamped so the subtraction cannot underflow) and then
 * adds the node's inactive and active file pages.
 *
 * Return: the node's number of pages potentially available for dirty
 * page cache.  This is the base value for the per-node dirty limits.
 */
static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
{
	unsigned long nr_pages = 0;
	int z;

	for (z = 0; z < MAX_NR_ZONES; z++) {
		struct zone *zone = pgdat->node_zones + z;

		if (!populated_zone(zone))
			continue;

		nr_pages += zone_page_state(zone, NR_FREE_PAGES);
	}

	/*
	 * Pages reserved for the kernel should not be considered
	 * dirtyable, to prevent a situation where reclaim has to
	 * clean pages in order to balance the zones.
	 */
	nr_pages -= min(nr_pages, pgdat->totalreserve_pages);

	nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE);
	nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE);

	return nr_pages;
}

/*
 * highmem_dirtyable_memory - dirtyable pages that live in highmem zones
 * @total: upper bound for the result (the caller's total dirtyable memory)
 *
 * For every populated highmem zone, counts free pages above the zone's high
 * watermark plus the zone's inactive and active file pages.
 *
 * Return: the sum, capped at @total; always 0 when CONFIG_HIGHMEM is not set.
 */
static unsigned long highmem_dirtyable_memory(unsigned long total)
{
#ifdef CONFIG_HIGHMEM
	int node;
	unsigned long x = 0;
	int i;

	for_each_node_state(node, N_HIGH_MEMORY) {
		for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) {
			struct zone *z;
			unsigned long nr_pages;

			if (!is_highmem_idx(i))
				continue;

			z = &NODE_DATA(node)->node_zones[i];
			if (!populated_zone(z))
				continue;

			nr_pages = zone_page_state(z, NR_FREE_PAGES);
			/* watch for underflows */
			nr_pages -= min(nr_pages, high_wmark_pages(z));
			nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE);
			nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE);
			x += nr_pages;
		}
	}

	/*
	 * Unreclaimable memory (kernel memory or anonymous memory
	 * without swap) can bring down the dirtyable pages below
	 * the zone's dirty balance reserve and the above calculation
	 * will underflow.  However we still want to add in nodes
	 * which are below threshold (negative values) to get a more
	 * accurate calculation but make sure that the total never
	 * underflows.
	 */
	if ((long)x < 0)
		x = 0;

	/*
	 * Make sure that the number of highmem pages is never larger
	 * than the number of the total dirtyable memory. This can only
	 * occur in very strange VM situations but we want to make sure
	 * that this does not occur.
	 */
	return min(x, total);
#else
	return 0;
#endif
}

/**
 * global_dirtyable_memory - number of globally dirtyable pages
 *
 * Global free pages minus totalreserve_pages (clamped at zero), plus the
 * global inactive and active file pages.  The highmem share is subtracted
 * unless vm_highmem_is_dirtyable is set.
 *
 * Return: the global number of pages potentially available for dirty
 * page cache.  This is the base value for the global dirty limits.
 * The result is always at least 1.
 */
static unsigned long global_dirtyable_memory(void)
{
	unsigned long x;

	x = global_zone_page_state(NR_FREE_PAGES);
	/*
	 * Pages reserved for the kernel should not be considered
	 * dirtyable, to prevent a situation where reclaim has to
	 * clean pages in order to balance the zones.
	 */
	x -= min(x, totalreserve_pages);

	x += global_node_page_state(NR_INACTIVE_FILE);
	x += global_node_page_state(NR_ACTIVE_FILE);

	if (!vm_highmem_is_dirtyable)
		x -= highmem_dirtyable_memory(x);

	return x + 1;	/* Ensure that we never return 0 */
}

/**
 * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain
 * @dtc: dirty_throttle_control of interest
 *
 * Calculate @dtc->thresh and ->bg_thresh considering
 * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}.  The caller
 * must ensure that @dtc->avail is set before calling this function.
 *
 * A non-zero *_bytes setting takes precedence over the matching ratio.  If
 * the resulting bg_thresh is not below thresh, it is forced to thresh / 2.
 * For real-time tasks both limits are then raised by 1/4 of their own value
 * plus 1/32 of global_wb_domain.dirty_limit.
 */
static void domain_dirty_limits(struct dirty_throttle_control *dtc)
{
	const unsigned long available_memory = dtc->avail;
	struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
	unsigned long bytes = vm_dirty_bytes;
	unsigned long bg_bytes = dirty_background_bytes;
	/* convert ratios to per-PAGE_SIZE for higher precision */
	unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100;
	unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100;
	unsigned long thresh;
	unsigned long bg_thresh;
	struct task_struct *tsk;

	/* gdtc is !NULL iff @dtc is for memcg domain */
	if (gdtc) {
		unsigned long global_avail = gdtc->avail;

		/*
		 * The byte settings can't be applied directly to memcg
		 * domains.  Convert them to ratios by scaling against
		 * globally available memory.  As the ratios are in
		 * per-PAGE_SIZE, they can be obtained by dividing bytes by
		 * number of pages.
		 */
		if (bytes)
			ratio = min(DIV_ROUND_UP(bytes, global_avail),
				    PAGE_SIZE);
		if (bg_bytes)
			bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
				       PAGE_SIZE);
		/* from here on only the (converted) ratios are used */
		bytes = bg_bytes = 0;
	}

	if (bytes)
		thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
	else
		thresh = (ratio * available_memory) / PAGE_SIZE;

	if (bg_bytes)
		bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
	else
		bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;

	/* keep the background threshold strictly below the hard one */
	if (bg_thresh >= thresh)
		bg_thresh = thresh / 2;
	tsk = current;
	if (rt_task(tsk)) {
		bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
		thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
	}
	dtc->thresh = thresh;
	dtc->bg_thresh = bg_thresh;

	/* we should eventually report the domain in the TP */
	if (!gdtc)
		trace_global_dirty_state(bg_thresh, thresh);
}

/**
 * global_dirty_limits - background-writeback and dirty-throttling thresholds
 * @pbackground: out parameter for bg_thresh
 * @pdirty: out parameter for thresh
 *
 * Calculate bg_thresh and thresh for global_wb_domain.  See
 * domain_dirty_limits() for details.
 */
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{
	struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };

	gdtc.avail = global_dirtyable_memory();
	domain_dirty_limits(&gdtc);

	*pbackground = gdtc.bg_thresh;
	*pdirty = gdtc.thresh;
}

/**
 * node_dirty_limit - maximum number of dirty pages allowed in a node
 * @pgdat: the node
 *
 * Return: the maximum number of dirty pages allowed in a node, based
 * on the node's dirtyable memory.
*/
static unsigned long node_dirty_limit(struct pglist_data *pgdat)
{
	unsigned long node_memory = node_dirtyable_memory(pgdat);
	struct task_struct *tsk = current;
	unsigned long dirty;

	/*
	 * With vm_dirty_bytes set, the node gets the share of the global page
	 * limit that matches its share of globally dirtyable memory;
	 * otherwise vm_dirty_ratio percent of the node's dirtyable memory.
	 */
	if (vm_dirty_bytes)
		dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
			node_memory / global_dirtyable_memory();
	else
		dirty = vm_dirty_ratio * node_memory / 100;

	/* real-time tasks get a 25% higher limit */
	if (rt_task(tsk))
		dirty += dirty / 4;

	return dirty;
}

/**
 * node_dirty_ok - tells whether a node is within its dirty limits
 * @pgdat: the node to check
 *
 * Compares the node's NR_FILE_DIRTY + NR_WRITEBACK pages against
 * node_dirty_limit().
 *
 * Return: %true when the dirty pages in @pgdat are within the node's
 * dirty limit, %false if the limit is exceeded.
 */
bool node_dirty_ok(struct pglist_data *pgdat)
{
	unsigned long limit = node_dirty_limit(pgdat);
	unsigned long nr_pages = 0;

	nr_pages += node_page_state(pgdat, NR_FILE_DIRTY);
	nr_pages += node_page_state(pgdat, NR_WRITEBACK);

	return nr_pages <= limit;
}

/*
 * sysctl handler for the background dirty ratio.  A successful write clears
 * dirty_background_bytes.  Returns the result of proc_dointvec_minmax().
 */
int dirty_background_ratio_handler(struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		dirty_background_bytes = 0;
	return ret;
}

/*
 * sysctl handler for the background dirty byte limit.  A successful write
 * clears dirty_background_ratio.  Returns the result of
 * proc_doulongvec_minmax().
 */
int dirty_background_bytes_handler(struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		dirty_background_ratio = 0;
	return ret;
}

/*
 * sysctl handler for vm_dirty_ratio.  When a successful write changes the
 * value, the ratelimit is recomputed via writeback_set_ratelimit() and
 * vm_dirty_bytes is cleared.  Returns the result of proc_dointvec_minmax().
 */
int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos)
{
	int old_ratio = vm_dirty_ratio;
	int ret;

	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
		writeback_set_ratelimit();
		vm_dirty_bytes = 0;
	}
	return ret;
}

/*
 * sysctl handler for vm_dirty_bytes.  When a successful write changes the
 * value, the ratelimit is recomputed via writeback_set_ratelimit() and
 * vm_dirty_ratio is cleared.  Returns the result of proc_doulongvec_minmax().
 */
int dirty_bytes_handler(struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	unsigned long old_bytes = vm_dirty_bytes;
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
		writeback_set_ratelimit();
		vm_dirty_ratio = 0;
	}
	return ret;
}

/*
 * Return @cur_time advanced by one completion period, never returning 0
 * because callers use a period_time of 0 to mean "period timer not running".
 */
static unsigned long wp_next_time(unsigned long cur_time)
{
	cur_time += VM_COMPLETIONS_PERIOD_LEN;
	/* 0 has a special meaning... */
	if (!cur_time)
		return 1;
	return cur_time;
}

/*
 * Account one writeout completion to @completions within @dom, and start
 * @dom's period timer if it is currently stopped (period_time == 0).
 */
static void wb_domain_writeout_inc(struct wb_domain *dom,
				   struct fprop_local_percpu *completions,
				   unsigned int max_prop_frac)
{
	__fprop_inc_percpu_max(&dom->completions, completions,
			       max_prop_frac);
	/* First event after period switching was turned off? */
	if (unlikely(!dom->period_time)) {
		/*
		 * We can race with other __wb_writeout_inc calls here but
		 * it does not cause any harm since the resulting time when
		 * timer will fire and what is in dom->period_time will be
		 * roughly the same.
		 */
		dom->period_time = wp_next_time(jiffies);
		mod_timer(&dom->period_timer, dom->period_time);
	}
}

/*
 * Increment @wb's writeout completion count and the global writeout
 * completion count. Called from test_clear_page_writeback().
 *
 * If @wb has a memcg wb_domain, the completion is accounted there as well.
 */
static inline void __wb_writeout_inc(struct bdi_writeback *wb)
{
	struct wb_domain *cgdom;

	inc_wb_stat(wb, WB_WRITTEN);
	wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
			       wb->bdi->max_prop_frac);

	cgdom = mem_cgroup_wb_domain(wb);
	if (cgdom)
		wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
				       wb->bdi->max_prop_frac);
}

/* Exported wrapper: runs __wb_writeout_inc() with local interrupts disabled. */
void wb_writeout_inc(struct bdi_writeback *wb)
{
	unsigned long flags;

	local_irq_save(flags);
	__wb_writeout_inc(wb);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(wb_writeout_inc);

/*
 * On idle system, we can be called long after we scheduled because we use
 * deferred timers so count with missed periods.
*/ static void writeout_period(struct timer_list *t) { struct wb_domain *dom = from_timer(dom, t, period_timer); int miss_periods = (jiffies - dom->period_time) / VM_COMPLETIONS_PERIOD_LEN; if (fprop_new_period(&dom->completions, miss_periods + 1)) { dom->period_time = wp_next_time(dom->period_time + miss_periods * VM_COMPLETIONS_PERIOD_LEN); mod_timer(&dom->period_timer, dom->period_time); } else { /* * Aging has zeroed all fractions. Stop wasting CPU on period * updates. */ dom->period_time = 0; } } int wb_domain_init(struct wb_domain *dom, gfp_t gfp) { memset(dom, 0, sizeof(*dom)); spin_lock_init(&dom->lock); timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE); dom->dirty_limit_tstamp = jiffies; return fprop_global_init(&dom->completions, gfp); } #ifdef CONFIG_CGROUP_WRITEBACK void wb_domain_exit(struct wb_domain *dom) { del_timer_sync(&dom->period_timer); fprop_global_destroy(&dom->completions); } #endif /* * bdi_min_ratio keeps the sum of the minimum dirty shares of all * registered backing devices, which, for obvious reasons, can not * exceed 100%. 
*/ static unsigned int bdi_min_ratio; int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { int ret = 0; spin_lock_bh(&bdi_lock); if (min_ratio > bdi->max_ratio) { ret = -EINVAL; } else { min_ratio -= bdi->min_ratio; if (bdi_min_ratio + min_ratio < 100) { bdi_min_ratio += min_ratio; bdi->min_ratio += min_ratio; } else { ret = -EINVAL; } } spin_unlock_bh(&bdi_lock); return ret; } int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) { int ret = 0; if (max_ratio > 100) return -EINVAL; spin_lock_bh(&bdi_lock); if (bdi->min_ratio > max_ratio) { ret = -EINVAL; } else { bdi->max_ratio = max_ratio; bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100; } spin_unlock_bh(&bdi_lock); return ret; } EXPORT_SYMBOL(bdi_set_max_ratio); static unsigned long dirty_freerun_ceiling(unsigned long thresh, unsigned long bg_thresh) { return (thresh + bg_thresh) / 2; } static unsigned long hard_dirty_limit(struct wb_domain *dom, unsigned long thresh) { return max(thresh, dom->dirty_limit); } /* * Memory which can be further allocated to a memcg domain is capped by * system-wide clean memory excluding the amount being used in the domain. */ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, unsigned long filepages, unsigned long headroom) { struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc); unsigned long clean = filepages - min(filepages, mdtc->dirty); unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty); unsigned long other_clean = global_clean - min(global_clean, clean); mdtc->avail = filepages + min(headroom, other_clean); } /** * __wb_calc_thresh - @wb's share of dirty throttling threshold * @dtc: dirty_throttle_context of interest * * Note that balance_dirty_pages() will only seriously take it as a hard limit * when sleeping max_pause per page is not enough to keep the dirty pages under * control. 
For example, when the device is completely stalled due to some error
 * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
 * In the other normal situations, it acts more gently by throttling the tasks
 * more (rather than completely block them) when the wb dirty pages go high.
 *
 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
 * - starving fast devices
 * - piling up dirty pages (that will take long time to sync) on slow devices
 *
 * The wb's share of dirty limit will be adapting to its throughput and
 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
 *
 * Return: @wb's dirty limit in pages. The term "dirty" in the context of
 * dirty balancing includes all PG_dirty and PG_writeback pages.
 */
static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
{
	struct wb_domain *dom = dtc_dom(dtc);
	unsigned long thresh = dtc->thresh;
	u64 wb_thresh;
	unsigned long numerator, denominator;
	unsigned long wb_min_ratio, wb_max_ratio;

	/*
	 * Calculate this BDI's share of the thresh ratio.
	 */
	fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
			      &numerator, &denominator);

	/*
	 * Distribute the part of thresh that is not reserved by minimum
	 * ratios (bdi_min_ratio percent) in proportion to this wb's
	 * fraction numerator/denominator.
	 */
	wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
	wb_thresh *= numerator;
	wb_thresh = div64_ul(wb_thresh, denominator);

	wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);

	/* add the wb's guaranteed minimum, then cap at its maximum share */
	wb_thresh += (thresh * wb_min_ratio) / 100;
	if (wb_thresh > (thresh * wb_max_ratio) / 100)
		wb_thresh = thresh * wb_max_ratio / 100;

	return wb_thresh;
}

/*
 * Compute @wb's share of @thresh; see __wb_calc_thresh().  The temporary
 * dirty_throttle_control is built with GDTC_INIT().
 */
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
{
	struct dirty_throttle_control gdtc = { GDTC_INIT(wb),
					       .thresh = thresh };
	return __wb_calc_thresh(&gdtc);
}

/*
 *                           setpoint - dirty 3
 *        f(dirty) := 1.0 + (----------------)
 *                           limit - setpoint
 *
 * it's a 3rd order polynomial that subjects to
 *
 * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably fast
 * (2) f(setpoint) = 1.0 => the balance point
 * (3) f(limit)    = 0   => the hard limit
 * (4) df/dx      <= 0	 => negative feedback control
 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
 *     => fast response on large errors; small oscillation near setpoint
 *
 * The result is a fixed-point value with RATELIMIT_CALC_SHIFT fractional
 * bits, clamped to the range [0, 2.0].
 */
static long long pos_ratio_polynom(unsigned long setpoint,
					  unsigned long dirty,
					  unsigned long limit)
{
	long long pos_ratio;
	long x;

	/* x = (setpoint - dirty) / (limit - setpoint); "| 1" avoids a zero divisor */
	x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
		      (limit - setpoint) | 1);
	/* pos_ratio = x^3, shifting back after each multiplication */
	pos_ratio = x;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
	/* + 1.0 */
	pos_ratio += 1 << RATELIMIT_CALC_SHIFT;

	return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
}

/*
 * Dirty position control.
 *
 * (o) global/bdi setpoints
 *
 * We want the dirty pages be balanced around the global/wb setpoints.
 * When the number of dirty pages is higher/lower than the setpoint, the
 * dirty position control ratio (and hence task dirty ratelimit) will be
 * decreased/increased to bring the dirty pages back to the setpoint.
 *
 *     pos_ratio = 1 << RATELIMIT_CALC_SHIFT
 *
 *     if (dirty < setpoint) scale up   pos_ratio
 *     if (dirty > setpoint) scale down pos_ratio
 *
 *     if (wb_dirty < wb_setpoint) scale up   pos_ratio
 *     if (wb_dirty > wb_setpoint) scale down pos_ratio
 *
 *     task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
 *
 * (o) global control line
 *
 *     ^ pos_ratio
 *     |
 *     |            |<===== global dirty control scope ======>|
 * 2.0 .............*
 *     |            .*
 *     |            . *
 *     |            .   *
 *     |            .     *
 *     |            .        *
 *     |            .            *
 * 1.0 ................................*
 *     |            .                  .     *
 *     |            .                  .          *
 *     |            .                  .              *
 *     |            .                  .                 *
 *     |            .                  .                    *
 *   0 +------------.------------------.----------------------*------------->
 *           freerun^          setpoint^                 limit^   dirty pages
 *
 * (o) wb control line
 *
 *     ^ pos_ratio
 *     |
 *     |            *
 *     |              *
 *     |                *
 *     |                  *
 *     |                    * |<=========== span ============>|
 * 1.0 .......................*
 *     |                      . *
 *     |                      .   *
 *     |                      .     *
 *     |                      .       *
 *     |                      .         *
 *     |                      .           *
 *     |                      .             *
 *     |                      .               *
 *     |                      .                 *
 *     |                      .                   *
 *     |                      .                     *
 * 1/4 ...............................................* * * * * * * * * * * *
 *     |                      .                         .
 *     |                      .                           .
 *     |                      .                             .
 *   0 +----------------------.-------------------------------.------------->
 *                 wb_setpoint^                    x_intercept^
 *
 * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can
 * be smoothly throttled down to normal if it starts high in situations like
 * - start writing to a slow SD card and a fast disk at the same time. The SD
 *   card's wb_dirty may rush to many times higher than wb_setpoint.
* - the wb dirty thresh drops quickly due to change of JBOD workload */ static void wb_position_ratio(struct dirty_throttle_control *dtc) { struct bdi_writeback *wb = dtc->wb; unsigned long write_bw = wb->avg_write_bandwidth; unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); unsigned long wb_thresh = dtc->wb_thresh; unsigned long x_intercept; unsigned long setpoint; /* dirty pages' target balance point */ unsigned long wb_setpoint; unsigned long span; long long pos_ratio; /* for scaling up/down the rate limit */ long x; dtc->pos_ratio = 0; if (unlikely(dtc->dirty >= limit)) return; /* * global setpoint * * See comment for pos_ratio_polynom(). */ setpoint = (freerun + limit) / 2; pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit); /* * The strictlimit feature is a tool preventing mistrusted filesystems * from growing a large number of dirty pages before throttling. For * such filesystems balance_dirty_pages always checks wb counters * against wb limits. Even if global "nr_dirty" is under "freerun". * This is especially important for fuse which sets bdi->max_ratio to * 1% by default. Without strictlimit feature, fuse writeback may * consume arbitrary amount of RAM because it is accounted in * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". * * Here, in wb_position_ratio(), we calculate pos_ratio based on * two values: wb_dirty and wb_thresh. Let's consider an example: * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global * limits are set by default to 10% and 20% (background and throttle). * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is * about ~6K pages (as the average of background and throttle wb * limits). The 3rd order polynomial will provide positive feedback if * wb_dirty is under wb_setpoint and vice versa. 
* * Note, that we cannot use global counters in these calculations * because we want to throttle process writing to a strictlimit wb * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB * in the example above). */ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { long long wb_pos_ratio; if (dtc->wb_dirty < 8) { dtc->pos_ratio = min_t(long long, pos_ratio * 2, 2 << RATELIMIT_CALC_SHIFT); return; } if (dtc->wb_dirty >= wb_thresh) return; wb_setpoint = dirty_freerun_ceiling(wb_thresh, dtc->wb_bg_thresh); if (wb_setpoint == 0 || wb_setpoint == wb_thresh) return; wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty, wb_thresh); /* * Typically, for strictlimit case, wb_setpoint << setpoint * and pos_ratio >> wb_pos_ratio. In the other words global * state ("dirty") is not limiting factor and we have to * make decision based on wb counters. But there is an * important case when global pos_ratio should get precedence: * global limits are exceeded (e.g. due to activities on other * wb's) while given strictlimit wb is below limit. * * "pos_ratio * wb_pos_ratio" would work for the case above, * but it would look too non-natural for the case of all * activity in the system coming from a single strictlimit wb * with bdi->max_ratio == 100%. * * Note that min() below somewhat changes the dynamics of the * control system. Normally, pos_ratio value can be well over 3 * (when globally we are at freerun and wb is well below wb * setpoint). Now the maximum pos_ratio in the same situation * is 2. We might want to tweak this if we observe the control * system is too slow to adapt. */ dtc->pos_ratio = min(pos_ratio, wb_pos_ratio); return; } /* * We have computed basic pos_ratio above based on global situation. If * the wb is over/under its share of dirty pages, we want to scale * pos_ratio further down/up. That is done by the following mechanism. 
*/ /* * wb setpoint * * f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint) * * x_intercept - wb_dirty * := -------------------------- * x_intercept - wb_setpoint * * The main wb control line is a linear function that subjects to * * (1) f(wb_setpoint) = 1.0 * (2) k = - 1 / (8 * write_bw) (in single wb case) * or equally: x_intercept = wb_setpoint + 8 * write_bw * * For single wb case, the dirty pages are observed to fluctuate * regularly within range * [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2] * for various filesystems, where (2) can yield in a reasonable 12.5% * fluctuation range for pos_ratio. * * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its * own size, so move the slope over accordingly and choose a slope that * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh. */ if (unlikely(wb_thresh > dtc->thresh)) wb_thresh = dtc->thresh; /* * It's very possible that wb_thresh is close to 0 not because the * device is slow, but that it has remained inactive for long time. * Honour such devices a reasonable good (hopefully IO efficient) * threshold, so that the occasional writes won't be blocked and active * writes can rampup the threshold quickly. */ wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8); /* * scale global setpoint to wb's: * wb_setpoint = setpoint * wb_thresh / thresh */ x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1); wb_setpoint = setpoint * (u64)x >> 16; /* * Use span=(8*write_bw) in single wb case as indicated by * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case. 
* * wb_thresh thresh - wb_thresh * span = --------- * (8 * write_bw) + ------------------ * wb_thresh * thresh thresh */ span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16; x_intercept = wb_setpoint + span; if (dtc->wb_dirty < x_intercept - span / 4) { pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty), (x_intercept - wb_setpoint) | 1); } else pos_ratio /= 4; /* * wb reserve area, safeguard against dirty pool underrun and disk idle * It may push the desired control point of global dirty pages higher * than setpoint. */ x_intercept = wb_thresh / 2; if (dtc->wb_dirty < x_intercept) { if (dtc->wb_dirty > x_intercept / 8) pos_ratio = div_u64(pos_ratio * x_intercept, dtc->wb_dirty); else pos_ratio *= 8; } dtc->pos_ratio = pos_ratio; } static void wb_update_write_bandwidth(struct bdi_writeback *wb, unsigned long elapsed, unsigned long written) { const unsigned long period = roundup_pow_of_two(3 * HZ); unsigned long avg = wb->avg_write_bandwidth; unsigned long old = wb->write_bandwidth; u64 bw; /* * bw = written * HZ / elapsed * * bw * elapsed + write_bandwidth * (period - elapsed) * write_bandwidth = --------------------------------------------------- * period * * @written may have decreased due to account_page_redirty(). * Avoid underflowing @bw calculation. 
*/ bw = written - min(written, wb->written_stamp); bw *= HZ; if (unlikely(elapsed > period)) { bw = div64_ul(bw, elapsed); avg = bw; goto out; } bw += (u64)wb->write_bandwidth * (period - elapsed); bw >>= ilog2(period); /* * one more level of smoothing, for filtering out sudden spikes */ if (avg > old && old >= (unsigned long)bw) avg -= (avg - old) >> 3; if (avg < old && old <= (unsigned long)bw) avg += (old - avg) >> 3; out: /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */ avg = max(avg, 1LU); if (wb_has_dirty_io(wb)) { long delta = avg - wb->avg_write_bandwidth; WARN_ON_ONCE(atomic_long_add_return(delta, &wb->bdi->tot_write_bandwidth) <= 0); } wb->write_bandwidth = bw; wb->avg_write_bandwidth = avg; } static void update_dirty_limit(struct dirty_throttle_control *dtc) { struct wb_domain *dom = dtc_dom(dtc); unsigned long thresh = dtc->thresh; unsigned long limit = dom->dirty_limit; /* * Follow up in one step. */ if (limit < thresh) { limit = thresh; goto update; } /* * Follow down slowly. Use the higher one as the target, because thresh * may drop below dirty. This is exactly the reason to introduce * dom->dirty_limit which is guaranteed to lie above the dirty pages. */ thresh = max(thresh, dtc->dirty); if (limit > thresh) { limit -= (limit - thresh) >> 5; goto update; } return; update: dom->dirty_limit = limit; } static void domain_update_bandwidth(struct dirty_throttle_control *dtc, unsigned long now) { struct wb_domain *dom = dtc_dom(dtc); /* * check locklessly first to optimize away locking for the most time */ if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) return; spin_lock(&dom->lock); if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) { update_dirty_limit(dtc); dom->dirty_limit_tstamp = now; } spin_unlock(&dom->lock); } /* * Maintain wb->dirty_ratelimit, the base dirty throttle rate. * * Normal wb tasks will be curbed at or below it in long term. 
* Obviously it should be around (write_bw / N) when there are N dd tasks.
 */
static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
				      unsigned long dirtied,
				      unsigned long elapsed)
{
	struct bdi_writeback *wb = dtc->wb;
	unsigned long dirty = dtc->dirty;
	unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
	unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
	unsigned long setpoint = (freerun + limit) / 2;
	unsigned long write_bw = wb->avg_write_bandwidth;
	unsigned long dirty_ratelimit = wb->dirty_ratelimit;
	unsigned long dirty_rate;
	unsigned long task_ratelimit;
	unsigned long balanced_dirty_ratelimit;
	unsigned long step;
	unsigned long x;
	unsigned long shift;

	/*
	 * The dirty rate will match the writeout rate in long term, except
	 * when dirty pages are truncated by userspace or re-dirtied by FS.
	 */
	dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;

	/*
	 * task_ratelimit reflects each dd's dirty rate for the past 200ms.
	 */
	task_ratelimit = (u64)dirty_ratelimit *
					dtc->pos_ratio >> RATELIMIT_CALC_SHIFT;
	task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */

	/*
	 * A linear estimation of the "balanced" throttle rate. The theory is,
	 * if there are N dd tasks, each throttled at task_ratelimit, the wb's
	 * dirty_rate will be measured to be (N * task_ratelimit). So the below
	 * formula will yield the balanced rate limit (write_bw / N).
	 *
	 * Note that the expanded form is not a pure rate feedback:
	 *	rate_(i+1) = rate_(i) * (write_bw / dirty_rate)		     (1)
	 * but also takes pos_ratio into account:
	 *	rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio  (2)
	 *
	 * (1) is not realistic because pos_ratio also takes part in balancing
	 * the dirty rate.  Consider the state
	 *	pos_ratio = 0.5						     (3)
	 *	rate = 2 * (write_bw / N)				     (4)
	 * If (1) is used, it will stuck in that state! Because each dd will
	 * be throttled at
	 *	task_ratelimit = pos_ratio * rate = (write_bw / N)	     (5)
	 * yielding
	 *	dirty_rate = N * task_ratelimit = write_bw		     (6)
	 * put (6) into (1) we get
	 *	rate_(i+1) = rate_(i)					     (7)
	 *
	 * So we end up using (2) to always keep
	 *	rate_(i+1) ~= (write_bw / N)				     (8)
	 * regardless of the value of pos_ratio. As long as (8) is satisfied,
	 * pos_ratio is able to drive itself to 1.0, which is not only where
	 * the dirty count meet the setpoint, but also where the slope of
	 * pos_ratio is most flat and hence task_ratelimit is least fluctuated.
	 *
	 * NOTE(review): dirty_rate is an unsigned long while div_u64() is
	 * understood to take a 32-bit divisor; confirm that dirty_rate cannot
	 * exceed 32 bits here.
	 */
	balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
					   dirty_rate | 1);
	/*
	 * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw
	 */
	if (unlikely(balanced_dirty_ratelimit > write_bw))
		balanced_dirty_ratelimit = write_bw;

	/*
	 * We could safely do this and return immediately:
	 *
	 *	wb->dirty_ratelimit = balanced_dirty_ratelimit;
	 *
	 * However to get a more stable dirty_ratelimit, the below elaborated
	 * code makes use of task_ratelimit to filter out singular points and
	 * limit the step size.
	 *
	 * The below code essentially only uses the relative value of
	 *
	 *	task_ratelimit - dirty_ratelimit
	 *	= (pos_ratio - 1) * dirty_ratelimit
	 *
	 * which reflects the direction and size of dirty position error.
	 */

	/*
	 * dirty_ratelimit will follow balanced_dirty_ratelimit iff
	 * task_ratelimit is on the same side of dirty_ratelimit, too.
	 * For example, when
	 * - dirty_ratelimit > balanced_dirty_ratelimit
	 * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
	 * lowering dirty_ratelimit will help meet both the position and rate
	 * control targets. Otherwise, don't update dirty_ratelimit if it will
	 * only help meet the rate target. After all, what the users ultimately
	 * feel and care are stable dirty rate and small position error.
	 *
	 * |task_ratelimit - dirty_ratelimit| is used to limit the step size
	 * and filter out the singular points of balanced_dirty_ratelimit. Which
	 * keeps jumping around randomly and can even leap far away at times
	 * due to the small 200ms estimation period of dirty_rate (we want to
	 * keep that period small to reduce time lags).
	 */
	step = 0;

	/*
	 * For strictlimit case, calculations above were based on wb counters
	 * and limits (starting from pos_ratio = wb_position_ratio() and up to
	 * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
	 * Hence, to calculate "step" properly, we have to use wb_dirty as
	 * "dirty" and wb_setpoint as "setpoint".
	 *
	 * We rampup dirty_ratelimit forcibly if wb_dirty is low because
	 * it's possible that wb_thresh is close to zero due to inactivity
	 * of backing device.
	 */
	if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
		dirty = dtc->wb_dirty;
		if (dtc->wb_dirty < 8)
			setpoint = dtc->wb_dirty + 1;
		else
			setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
	}

	if (dirty < setpoint) {
		/* below setpoint: the ratelimit may only be raised */
		x = min3(wb->balanced_dirty_ratelimit,
			 balanced_dirty_ratelimit, task_ratelimit);
		if (dirty_ratelimit < x)
			step = x - dirty_ratelimit;
	} else {
		/* at or above setpoint: the ratelimit may only be lowered */
		x = max3(wb->balanced_dirty_ratelimit,
			 balanced_dirty_ratelimit, task_ratelimit);
		if (dirty_ratelimit > x)
			step = dirty_ratelimit - x;
	}

	/*
	 * Don't pursue 100% rate matching. It's impossible since the balanced
	 * rate itself is constantly fluctuating. So decrease the track speed
	 * when it gets close to the target. Helps eliminate pointless tremors.
	 */
	shift = dirty_ratelimit / (2 * step + 1);
	if (shift < BITS_PER_LONG)
		step = DIV_ROUND_UP(step >> shift, 8);
	else
		step = 0;

	if (dirty_ratelimit < balanced_dirty_ratelimit)
		dirty_ratelimit += step;
	else
		dirty_ratelimit -= step;

	/* never let the stored ratelimit drop to zero */
	wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
	wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;

	trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
}

/*
 * Update @gdtc->wb's bandwidth estimates and, if @update_ratelimit is set,
 * the dirty limits and ratelimits of @gdtc (and of @mdtc when it is not
 * NULL).  Does nothing if less than BANDWIDTH_INTERVAL has passed since the
 * last update.  The caller must hold wb->list_lock.
 */
static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
				  struct dirty_throttle_control *mdtc,
				  unsigned long start_time,
				  bool update_ratelimit)
{
	struct bdi_writeback *wb = gdtc->wb;
	unsigned long now = jiffies;
	unsigned long elapsed = now - wb->bw_time_stamp;
	unsigned long dirtied;
	unsigned long written;

	lockdep_assert_held(&wb->list_lock);

	/*
	 * rate-limit, only update once every 200ms.
	 */
	if (elapsed < BANDWIDTH_INTERVAL)
		return;

	dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
	written = percpu_counter_read(&wb->stat[WB_WRITTEN]);

	/*
	 * Skip quiet periods when disk bandwidth is under-utilized.
	 * (at least 1s idle time between two flusher runs)
	 */
	if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
		goto snapshot;

	if (update_ratelimit) {
		domain_update_bandwidth(gdtc, now);
		wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);

		/*
		 * @mdtc is always NULL if !CGROUP_WRITEBACK but the
		 * compiler has no way to figure that out.  Help it.
		 */
		if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
			domain_update_bandwidth(mdtc, now);
			wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
		}
	}
	wb_update_write_bandwidth(wb, elapsed, written);

snapshot:
	/* remember the counters and the time for the next interval */
	wb->dirtied_stamp = dirtied;
	wb->written_stamp = written;
	wb->bw_time_stamp = now;
}

/*
 * Update only @wb's write bandwidth estimate (no ratelimit update, no memcg
 * dtc); see __wb_update_bandwidth().
 */
void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
{
	struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };

	__wb_update_bandwidth(&gdtc, NULL, start_time, false);
}

/*
 * After a task dirtied this many pages, balance_dirty_pages_ratelimited()
 * will look to see if it needs to start dirty throttling.
*
 * If dirty_poll_interval is too low, big NUMA machines will call the expensive
 * global_zone_page_state() too often. So scale it near-sqrt to the safety margin
 * (the number of pages we may dirty without exceeding the dirty limits).
 */
static unsigned long dirty_poll_interval(unsigned long dirty,
					 unsigned long thresh)
{
	if (thresh > dirty)
		return 1UL << (ilog2(thresh - dirty) >> 1);

	/* No margin left (dirty >= thresh): poll after every page. */
	return 1;
}

/*
 * Upper bound for a single throttling sleep on @wb.
 *
 * Derived from @wb_dirty and the wb's average write bandwidth, then capped at
 * MAX_PAUSE.  balance_dirty_pages() passes the result on to
 * io_schedule_timeout(), so the value is a timeout in jiffies.
 */
static unsigned long wb_max_pause(struct bdi_writeback *wb,
				  unsigned long wb_dirty)
{
	unsigned long bw = wb->avg_write_bandwidth;
	unsigned long t;

	/*
	 * Limit pause time for small memory systems. If sleeping for too long
	 * time, a small pool of dirty/writeback pages may go empty and disk go
	 * idle.
	 *
	 * 8 serves as the safety ratio.
	 */
	t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
	t++;

	return min_t(unsigned long, t, MAX_PAUSE);
}

/*
 * Compute the minimum pause for the current dirtier and the matching number
 * of pages it may dirty before being checked again.
 *
 * @wb:			writeback whose bandwidth and ratelimit are consulted
 * @max_pause:		upper bound on the pause, as returned by wb_max_pause()
 * @task_ratelimit:	ratelimit currently applied to the task
 * @dirty_ratelimit:	the wb's dirty ratelimit
 * @nr_dirtied_pause:	out: target number of dirtied pages between pauses
 *
 * Return: the minimum pause time.
 */
static long wb_min_pause(struct bdi_writeback *wb,
			 long max_pause,
			 unsigned long task_ratelimit,
			 unsigned long dirty_ratelimit,
			 int *nr_dirtied_pause)
{
	long hi = ilog2(wb->avg_write_bandwidth);
	long lo = ilog2(wb->dirty_ratelimit);
	long t;		/* target pause */
	long pause;	/* estimated next pause */
	int pages;	/* target nr_dirtied_pause */

	/* target for 10ms pause on 1-dd case */
	t = max(1, HZ / 100);

	/*
	 * Scale up pause time for concurrent dirtiers in order to reduce CPU
	 * overheads.
	 *
	 * (N * 10ms) on 2^N concurrent tasks.
	 */
	if (hi > lo)
		t += (hi - lo) * (10 * HZ) / 1024;

	/*
	 * This is a bit convoluted. We try to base the next nr_dirtied_pause
	 * on the much more stable dirty_ratelimit. However the next pause time
	 * will be computed based on task_ratelimit and the two rate limits may
	 * depart considerably at some time. Especially if task_ratelimit goes
	 * below dirty_ratelimit/2 and the target pause is max_pause, the next
	 * pause time will be max_pause*2 _trimmed down_ to max_pause.  As a
	 * result task_ratelimit won't be executed faithfully, which could
	 * eventually bring down dirty_ratelimit.
	 *
	 * We apply two rules to fix it up:
	 * 1) try to estimate the next pause time and if necessary, use a lower
	 *    nr_dirtied_pause so as not to exceed max_pause. When this happens,
	 *    nr_dirtied_pause will be "dancing" with task_ratelimit.
	 * 2) limit the target pause time to max_pause/2, so that the normal
	 *    small fluctuations of task_ratelimit won't trigger rule (1) and
	 *    nr_dirtied_pause will remain as stable as dirty_ratelimit.
	 */
	t = min(t, 1 + max_pause / 2);
	pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);

	/*
	 * Tiny nr_dirtied_pause is found to hurt I/O performance in the test
	 * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}.
	 * When the 16 consecutive reads are often interrupted by some dirty
	 * throttling pause during the async writes, cfq will go into idles
	 * (deadline is fine). So push nr_dirtied_pause as high as possible
	 * until reaches DIRTY_POLL_THRESH=32 pages.
	 */
	if (pages < DIRTY_POLL_THRESH) {
		t = max_pause;
		pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
		if (pages > DIRTY_POLL_THRESH) {
			pages = DIRTY_POLL_THRESH;
			t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
		}
	}

	/* Rule (1) above: estimate the next pause and trim pages if needed. */
	pause = HZ * pages / (task_ratelimit + 1);
	if (pause > max_pause) {
		t = max_pause;
		pages = task_ratelimit * t / roundup_pow_of_two(HZ);
	}

	*nr_dirtied_pause = pages;
	/*
	 * The minimal pause time will normally be half the target pause time.
	 */
	return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
}

/*
 * Fill in the per-wb fields of @dtc: wb_thresh, wb_bg_thresh and wb_dirty.
 */
static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
{
	struct bdi_writeback *wb = dtc->wb;
	unsigned long wb_reclaimable;

	/*
	 * wb_thresh is not treated as some limiting factor as
	 * dirty_thresh, due to reasons
	 * - in JBOD setup, wb_thresh can fluctuate a lot
	 * - in a system with HDD and USB key, the USB key may somehow
	 *   go into state (wb_dirty >> wb_thresh) either because
	 *   wb_dirty starts high, or because wb_thresh drops low.
	 *   In this case we don't want to hard throttle the USB key
	 *   dirtiers for 100 seconds until wb_dirty drops under
	 *   wb_thresh. Instead the auxiliary wb control line in
	 *   wb_position_ratio() will let the dirtier task progress
	 *   at some rate <= (write_bw / 2) for bringing down wb_dirty.
	 */
	dtc->wb_thresh = __wb_calc_thresh(dtc);
	/* Scale bg_thresh by wb_thresh / thresh; 0 when thresh is 0. */
	dtc->wb_bg_thresh = dtc->thresh ?
		div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;

	/*
	 * In order to avoid the stacked BDI deadlock we need
	 * to ensure we accurately count the 'dirty' pages when
	 * the threshold is low.
	 *
	 * Otherwise it would be possible to get thresh+n pages
	 * reported dirty, even though there are thresh-m pages
	 * actually dirty; with m+n sitting in the percpu
	 * deltas.
	 */
	if (dtc->wb_thresh < 2 * wb_stat_error()) {
		wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
		dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
	} else {
		wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
		dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
	}
}

/*
 * balance_dirty_pages() must be called by processes which are generating dirty
 * data.  It looks at the number of dirty pages in the machine and will force
 * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
 * If we're over `background_thresh' then the writeback threads are woken to
 * perform some writeout.
 *
 * @wb:			writeback structure the caller is dirtying against
 * @pages_dirtied:	number of pages dirtied since the last call; used to
 *			size the pause period
 */
static void balance_dirty_pages(struct bdi_writeback *wb,
				unsigned long pages_dirtied)
{
	struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
	struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
	struct dirty_throttle_control * const gdtc = &gdtc_stor;
	struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
						     &mdtc_stor : NULL;
	struct dirty_throttle_control *sdtc;
	unsigned long nr_reclaimable;	/* = file_dirty */
	long period;
	long pause;
	long max_pause;
	long min_pause;
	int nr_dirtied_pause;
	bool dirty_exceeded = false;
	unsigned long task_ratelimit;
	unsigned long dirty_ratelimit;
	struct backing_dev_info *bdi = wb->bdi;
	bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
	unsigned long start_time = jiffies;

	for (;;) {
		unsigned long now = jiffies;
		unsigned long dirty, thresh, bg_thresh;
		unsigned long m_dirty = 0;	/* stop bogus uninit warnings */
		unsigned long m_thresh = 0;
		unsigned long m_bg_thresh = 0;

		nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
		gdtc->avail = global_dirtyable_memory();
		gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);

		domain_dirty_limits(gdtc);

		/* Pick per-wb or domain-wide numbers for the freerun check. */
		if (unlikely(strictlimit)) {
			wb_dirty_limits(gdtc);

			dirty = gdtc->wb_dirty;
			thresh = gdtc->wb_thresh;
			bg_thresh = gdtc->wb_bg_thresh;
		} else {
			dirty = gdtc->dirty;
			thresh = gdtc->thresh;
			bg_thresh = gdtc->bg_thresh;
		}

		if (mdtc) {
			unsigned long filepages, headroom, writeback;

			/*
			 * If @wb belongs to !root memcg, repeat the same
			 * basic calculations for the memcg domain.
			 */
			mem_cgroup_wb_stats(wb, &filepages, &headroom,
					    &mdtc->dirty, &writeback);
			mdtc->dirty += writeback;
			mdtc_calc_avail(mdtc, filepages, headroom);

			domain_dirty_limits(mdtc);

			if (unlikely(strictlimit)) {
				wb_dirty_limits(mdtc);
				m_dirty = mdtc->wb_dirty;
				m_thresh = mdtc->wb_thresh;
				m_bg_thresh = mdtc->wb_bg_thresh;
			} else {
				m_dirty = mdtc->dirty;
				m_thresh = mdtc->thresh;
				m_bg_thresh = mdtc->bg_thresh;
			}
		}

		/*
		 * Throttle it only when the background writeback cannot
		 * catch-up. This avoids (excessively) small writeouts
		 * when the wb limits are ramping up in case of !strictlimit.
		 *
		 * In strictlimit case make decision based on the wb counters
		 * and limits. Small writeouts when the wb limits are ramping
		 * up are the price we consciously pay for strictlimit-ing.
		 *
		 * If memcg domain is in effect, @dirty should be under
		 * both global and memcg freerun ceilings.
		 */
		if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
		    (!mdtc ||
		     m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
			unsigned long intv;
			unsigned long m_intv;

			/*
			 * Also entered via goto from the PF_LOCAL_THROTTLE
			 * checks further down.
			 */
free_running:
			intv = dirty_poll_interval(dirty, thresh);
			m_intv = ULONG_MAX;

			current->dirty_paused_when = now;
			current->nr_dirtied = 0;
			if (mdtc)
				m_intv = dirty_poll_interval(m_dirty, m_thresh);
			current->nr_dirtied_pause = min(intv, m_intv);
			break;
		}

		if (unlikely(!writeback_in_progress(wb)))
			wb_start_background_writeback(wb);

		mem_cgroup_flush_foreign(wb);

		/*
		 * Calculate global domain's pos_ratio and select the
		 * global dtc by default.
		 */
		if (!strictlimit) {
			wb_dirty_limits(gdtc);

			if ((current->flags & PF_LOCAL_THROTTLE) &&
			    gdtc->wb_dirty <
			    dirty_freerun_ceiling(gdtc->wb_thresh,
						  gdtc->wb_bg_thresh))
				/*
				 * LOCAL_THROTTLE tasks must not be throttled
				 * when below the per-wb freerun ceiling.
				 */
				goto free_running;
		}

		dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
			((gdtc->dirty > gdtc->thresh) || strictlimit);

		wb_position_ratio(gdtc);
		sdtc = gdtc;

		if (mdtc) {
			/*
			 * If memcg domain is in effect, calculate its
			 * pos_ratio.  @wb should satisfy constraints from
			 * both global and memcg domains.  Choose the one
			 * w/ lower pos_ratio.
			 */
			if (!strictlimit) {
				wb_dirty_limits(mdtc);

				if ((current->flags & PF_LOCAL_THROTTLE) &&
				    mdtc->wb_dirty <
				    dirty_freerun_ceiling(mdtc->wb_thresh,
							  mdtc->wb_bg_thresh))
					/*
					 * LOCAL_THROTTLE tasks must not be
					 * throttled when below the per-wb
					 * freerun ceiling.
					 */
					goto free_running;
			}
			dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
				((mdtc->dirty > mdtc->thresh) || strictlimit);

			wb_position_ratio(mdtc);
			if (mdtc->pos_ratio < gdtc->pos_ratio)
				sdtc = mdtc;
		}

		if (dirty_exceeded && !wb->dirty_exceeded)
			wb->dirty_exceeded = 1;

		if (time_is_before_jiffies(wb->bw_time_stamp +
					   BANDWIDTH_INTERVAL)) {
			spin_lock(&wb->list_lock);
			__wb_update_bandwidth(gdtc, mdtc, start_time, true);
			spin_unlock(&wb->list_lock);
		}

		/* throttle according to the chosen dtc */
		dirty_ratelimit = wb->dirty_ratelimit;
		task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
							RATELIMIT_CALC_SHIFT;
		max_pause = wb_max_pause(wb, sdtc->wb_dirty);
		min_pause = wb_min_pause(wb, max_pause,
					 task_ratelimit, dirty_ratelimit,
					 &nr_dirtied_pause);

		/* A zero ratelimit would divide by zero below: sleep max_pause. */
		if (unlikely(task_ratelimit == 0)) {
			period = max_pause;
			pause = max_pause;
			goto pause;
		}
		period = HZ * pages_dirtied / task_ratelimit;
		pause = period;
		if (current->dirty_paused_when)
			pause -= now - current->dirty_paused_when;
		/*
		 * For less than 1s think time (ext3/4 may block the dirtier
		 * for up to 800ms from time to time on 1-HDD; so does xfs,
		 * however at much less frequency), try to compensate it in
		 * future periods by updating the virtual time; otherwise just
		 * do a reset, as it may be a light dirtier.
		 */
		if (pause < min_pause) {
			trace_balance_dirty_pages(wb,
						  sdtc->thresh,
						  sdtc->bg_thresh,
						  sdtc->dirty,
						  sdtc->wb_thresh,
						  sdtc->wb_dirty,
						  dirty_ratelimit,
						  task_ratelimit,
						  pages_dirtied,
						  period,
						  min(pause, 0L),
						  start_time);
			if (pause < -HZ) {
				current->dirty_paused_when = now;
				current->nr_dirtied = 0;
			} else if (period) {
				current->dirty_paused_when += period;
				current->nr_dirtied = 0;
			} else if (current->nr_dirtied_pause <= pages_dirtied)
				current->nr_dirtied_pause += pages_dirtied;
			break;
		}
		if (unlikely(pause > max_pause)) {
			/* for occasional dropped task_ratelimit */
			now += min(pause - max_pause, max_pause);
			pause = max_pause;
		}

pause:
		trace_balance_dirty_pages(wb,
					  sdtc->thresh,
					  sdtc->bg_thresh,
					  sdtc->dirty,
					  sdtc->wb_thresh,
					  sdtc->wb_dirty,
					  dirty_ratelimit,
					  task_ratelimit,
					  pages_dirtied,
					  period,
					  pause,
					  start_time);
		__set_current_state(TASK_KILLABLE);
		wb->dirty_sleep = now;
		io_schedule_timeout(pause);

		current->dirty_paused_when = now + pause;
		current->nr_dirtied = 0;
		current->nr_dirtied_pause = nr_dirtied_pause;

		/*
		 * This is typically equal to (dirty < thresh) and can also
		 * keep "1000+ dd on a slow USB stick" under control.
		 */
		if (task_ratelimit)
			break;

		/*
		 * In the case of an unresponding NFS server and the NFS dirty
		 * pages exceeds dirty_thresh, give the other good wb's a pipe
		 * to go through, so that tasks on them still remain responsive.
		 *
		 * In theory 1 page is enough to keep the consumer-producer
		 * pipe going: the flusher cleans 1 page => the task dirties 1
		 * more page. However wb_dirty has accounting errors.  So use
		 * the larger and more IO friendly wb_stat_error.
		 */
		if (sdtc->wb_dirty <= wb_stat_error())
			break;

		if (fatal_signal_pending(current))
			break;
	}

	if (!dirty_exceeded && wb->dirty_exceeded)
		wb->dirty_exceeded = 0;

	if (writeback_in_progress(wb))
		return;

	/*
	 * In laptop mode, we wait until hitting the higher threshold before
	 * starting background writeout, and then write out all the way down
	 * to the lower threshold.  So slow writers cause minimal disk activity.
	 *
	 * In normal mode, we start background writeout at the lower
	 * background_thresh, to keep the amount of dirty memory low.
	 */
	if (laptop_mode)
		return;

	if (nr_reclaimable > gdtc->bg_thresh)
		wb_start_background_writeback(wb);
}

/*
 * Per-CPU count of dirtied pages: incremented by account_page_dirtied() and
 * reset by balance_dirty_pages_ratelimited().
 */
static DEFINE_PER_CPU(int, bdp_ratelimits);

/*
 * Normal tasks are throttled by
 *	loop {
 *		dirty tsk->nr_dirtied_pause pages;
 *		take a snap in balance_dirty_pages();
 *	}
 * However there is a worst case. If every task exit immediately when dirtied
 * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be
 * called to throttle the page dirties. The solution is to save the not yet
 * throttled page dirties in dirty_throttle_leaks on task exit and charge them
 * randomly into the running tasks. This works well for the above worst case,
 * as the new task will pick up and accumulate the old task's leaked dirty
 * count and eventually get throttled.
 */
DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;

/**
 * balance_dirty_pages_ratelimited - balance dirty memory state
 * @mapping: address_space which was dirtied
 *
 * Processes which are dirtying memory should call in here once for each page
 * which was newly dirtied.  The function will periodically check the system's
 * dirty state and will initiate writeback if needed.
 *
 * On really big machines, get_writeback_state is expensive, so try to avoid
 * calling it too often (ratelimiting).  But once we're over the dirty memory
 * limit we decrease the ratelimiting by a lot, to prevent individual processes
 * from overshooting the limit by (ratelimit_pages) each.
*/
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
	struct inode *host = mapping->host;
	struct backing_dev_info *bdi = inode_to_bdi(host);
	struct bdi_writeback *wb = NULL;
	int *cpu_dirtied;
	int *leaked;
	int ratelimit;

	/* A bdi without BDI_CAP_WRITEBACK is never throttled here. */
	if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
		return;

	/* Prefer the cgroup-aware wb; fall back to the bdi's embedded one. */
	if (inode_cgwb_enabled(host))
		wb = wb_get_create_current(bdi, GFP_KERNEL);
	if (!wb)
		wb = &bdi->wb;

	ratelimit = current->nr_dirtied_pause;
	if (wb->dirty_exceeded)
		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));

	preempt_disable();

	/*
	 * Bound the number of pages one CPU may dirty without entering
	 * balance_dirty_pages().  That can otherwise happen when 1000+ tasks
	 * begin dirtying at the very same moment and all of them honour an
	 * overly large initial task->nr_dirtied_pause.
	 */
	cpu_dirtied = this_cpu_ptr(&bdp_ratelimits);
	if (unlikely(current->nr_dirtied >= ratelimit)) {
		*cpu_dirtied = 0;
	} else if (unlikely(*cpu_dirtied >= ratelimit_pages)) {
		*cpu_dirtied = 0;
		ratelimit = 0;
	}

	/*
	 * Charge this task with pages dirtied by tasks that have already
	 * exited, so that many short-lived tasks (e.g. gcc invocations in a
	 * kernel build) cannot dodge throttling and livelock long-running
	 * dirtiers.
	 */
	leaked = this_cpu_ptr(&dirty_throttle_leaks);
	if (*leaked > 0 && current->nr_dirtied < ratelimit) {
		unsigned long nr_charged = min(*leaked,
					       ratelimit - current->nr_dirtied);

		*leaked -= nr_charged;
		current->nr_dirtied += nr_charged;
	}

	preempt_enable();

	if (unlikely(current->nr_dirtied >= ratelimit))
		balance_dirty_pages(wb, current->nr_dirtied);

	wb_put(wb);
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited);

/**
 * wb_over_bg_thresh - does @wb need to be written back?
 * @wb: bdi_writeback of interest
 *
 * Determines whether background writeback should keep writing @wb or it's
 * clean enough.
 *
 * Return: %true if writeback should continue.
*/ bool wb_over_bg_thresh(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; struct dirty_throttle_control * const gdtc = &gdtc_stor; struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL; /* * Similar to balance_dirty_pages() but ignores pages being written * as we're trying to decide whether to put more under writeback. */ gdtc->avail = global_dirtyable_memory(); gdtc->dirty = global_node_page_state(NR_FILE_DIRTY); domain_dirty_limits(gdtc); if (gdtc->dirty > gdtc->bg_thresh) return true; if (wb_stat(wb, WB_RECLAIMABLE) > wb_calc_thresh(gdtc->wb, gdtc->bg_thresh)) return true; if (mdtc) { unsigned long filepages, headroom, writeback; mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty, &writeback); mdtc_calc_avail(mdtc, filepages, headroom); domain_dirty_limits(mdtc); /* ditto, ignore writeback */ if (mdtc->dirty > mdtc->bg_thresh) return true; if (wb_stat(wb, WB_RECLAIMABLE) > wb_calc_thresh(mdtc->wb, mdtc->bg_thresh)) return true; } return false; } /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { unsigned int old_interval = dirty_writeback_interval; int ret; ret = proc_dointvec(table, write, buffer, length, ppos); /* * Writing 0 to dirty_writeback_interval will disable periodic writeback * and a different non-zero value will wakeup the writeback threads. * wb_wakeup_delayed() would be more appropriate, but it's a pain to * iterate over all bdis and wbs. * The reason we do this is to make the change take effect immediately. 
*/ if (!ret && write && dirty_writeback_interval && dirty_writeback_interval != old_interval) wakeup_flusher_threads(WB_REASON_PERIODIC); return ret; } #ifdef CONFIG_BLOCK void laptop_mode_timer_fn(struct timer_list *t) { struct backing_dev_info *backing_dev_info = from_timer(backing_dev_info, t, laptop_mode_wb_timer); wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER); } /* * We've spun up the disk and we're in laptop mode: schedule writeback * of all dirty data a few seconds from now. If the flush is already scheduled * then push it back - the user is still using the disk. */ void laptop_io_completion(struct backing_dev_info *info) { mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode); } /* * We're in laptop mode and we've just synced. The sync's writes will have * caused another writeback to be scheduled by laptop_io_completion. * Nothing needs to be written back anymore, so we unschedule the writeback. */ void laptop_sync_completion(void) { struct backing_dev_info *bdi; rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) del_timer(&bdi->laptop_mode_wb_timer); rcu_read_unlock(); } #endif /* * If ratelimit_pages is too high then we can get into dirty-data overload * if a large number of processes all perform writes at the same time. * If it is too low then SMP machines will call the (expensive) * get_writeback_state too often. * * Here we set ratelimit_pages to a level which ensures that when all CPUs are * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory * thresholds. 
*/
void writeback_set_ratelimit(void)
{
	unsigned long bg_limit;
	unsigned long hard_limit;

	global_dirty_limits(&bg_limit, &hard_limit);
	global_wb_domain.dirty_limit = hard_limit;

	ratelimit_pages = hard_limit / (num_online_cpus() * 32);
	/* Never let the per-CPU ratelimit drop below 16 pages. */
	if (ratelimit_pages < 16)
		ratelimit_pages = 16;
}

/* CPU hotplug callback: recompute ratelimit_pages; always reports success. */
static int page_writeback_cpu_online(unsigned int cpu)
{
	writeback_set_ratelimit();
	return 0;
}

/*
 * Early initialisation of the page writeback dirty limits.
 *
 * Dirty pages used to be scaled by the ratio of total memory to the memory
 * usable for buffers.  That dates from the time "dirty_ratio" applied to all
 * memory.  It is now applied to total non-HIGHPAGE memory, so the old insane
 * case of a large amount of dirty pages against a small amount of non-HIGHMEM
 * memory can no longer occur.
 *
 * Scaling dirty_ratio by the amount of memory in the box might still be
 * desirable, though.
 */
void __init page_writeback_init(void)
{
	int err = wb_domain_init(&global_wb_domain, GFP_KERNEL);

	BUG_ON(err);

	/* The same callback serves both the online and the dead state. */
	cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/writeback:online",
			  page_writeback_cpu_online, NULL);
	cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead",
			  NULL, page_writeback_cpu_online);
}

/**
 * tag_pages_for_writeback - tag pages to be written by write_cache_pages
 * @mapping: address space structure to write
 * @start: starting page index
 * @end: ending page index (inclusive)
 *
 * This function scans the page range from @start to @end (inclusive) and tags
 * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
 * that write_cache_pages (or whoever calls this function) will then use
 * TOWRITE tag to identify pages eligible for writeback.
This mechanism is * used to avoid livelocking of writeback by a process steadily creating new * dirty pages in the file (thus it is important for this function to be quick * so that it can tag pages faster than a dirtying process can create them). */ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) { XA_STATE(xas, &mapping->i_pages, start); unsigned int tagged = 0; void *page; xas_lock_irq(&xas); xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) { xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); if (++tagged % XA_CHECK_SCHED) continue; xas_pause(&xas); xas_unlock_irq(&xas); cond_resched(); xas_lock_irq(&xas); } xas_unlock_irq(&xas); } EXPORT_SYMBOL(tag_pages_for_writeback); /** * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write * @writepage: function called for each page * @data: data passed to writepage function * * If a page is already under I/O, write_cache_pages() skips it, even * if it's dirty. This is desirable behaviour for memory-cleaning writeback, * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() * and msync() need to guarantee that all the data which was dirty at the time * the call was made get new I/O started against them. If wbc->sync_mode is * WB_SYNC_ALL then we were called for data integrity and we must wait for * existing IO to complete. * * To avoid livelocks (when other process dirties new pages), we first tag * pages which should be written back with TOWRITE tag and only then start * writing them. For data-integrity sync we have to be careful so that we do * not miss some pages (e.g., because some other process has cleared TOWRITE * tag we set). The rule we follow is that TOWRITE tag can be cleared only * by the process clearing the DIRTY tag (and submitting the page for IO). 
*
 * To avoid deadlocks between range_cyclic writeback and callers that hold
 * pages in PageWriteback to aggregate IO until write_cache_pages() returns,
 * we do not loop back to the start of the file. Doing so causes a page
 * lock/page writeback access order inversion - we should only ever lock
 * multiple pages in ascending page->index order, and looping back to the start
 * of the file violates that rule and causes deadlocks.
 *
 * Return: %0 on success, negative error code otherwise
 */
int write_cache_pages(struct address_space *mapping,
		      struct writeback_control *wbc, writepage_t writepage,
		      void *data)
{
	int ret = 0;
	int done = 0;
	int error;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	pgoff_t done_index;
	int range_whole = 0;
	xa_mark_t tag;

	pagevec_init(&pvec);
	if (wbc->range_cyclic) {
		index = mapping->writeback_index; /* prev offset */
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_SHIFT;
		end = wbc->range_end >> PAGE_SHIFT;
		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
			range_whole = 1;
	}
	/*
	 * Integrity and tagged writeback walk the TOWRITE tag set up front;
	 * everything else walks the DIRTY tag directly.
	 */
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
		tag_pages_for_writeback(mapping, index, end);
		tag = PAGECACHE_TAG_TOWRITE;
	} else {
		tag = PAGECACHE_TAG_DIRTY;
	}
	done_index = index;
	while (!done && (index <= end)) {
		int i;

		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
				tag);
		if (nr_pages == 0)
			break;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			done_index = page->index;

			lock_page(page);

			/*
			 * Page truncated or invalidated. We can freely skip it
			 * then, even for data integrity operations: the page
			 * has disappeared concurrently, so there could be no
			 * real expectation of this data integrity operation
			 * even if there is now a new, dirty page at the same
			 * pagecache address.
			 */
			if (unlikely(page->mapping != mapping)) {
				/* Shared skip path: unlock and move on. */
continue_unlock:
				unlock_page(page);
				continue;
			}

			if (!PageDirty(page)) {
				/* someone wrote it for us */
				goto continue_unlock;
			}

			if (PageWriteback(page)) {
				if (wbc->sync_mode != WB_SYNC_NONE)
					wait_on_page_writeback(page);
				else
					goto continue_unlock;
			}

			BUG_ON(PageWriteback(page));
			if (!clear_page_dirty_for_io(page))
				goto continue_unlock;

			trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
			error = (*writepage)(page, wbc, data);
			if (unlikely(error)) {
				/*
				 * Handle errors according to the type of
				 * writeback. There's no need to continue for
				 * background writeback. Just push done_index
				 * past this page so media errors won't choke
				 * writeout for the entire file. For integrity
				 * writeback, we must process the entire dirty
				 * set regardless of errors because the fs may
				 * still have state to clear for each page. In
				 * that case we continue processing and return
				 * the first error.
				 */
				if (error == AOP_WRITEPAGE_ACTIVATE) {
					unlock_page(page);
					error = 0;
				} else if (wbc->sync_mode != WB_SYNC_ALL) {
					ret = error;
					done_index = page->index + 1;
					done = 1;
					break;
				}
				if (!ret)
					ret = error;
			}

			/*
			 * We stop writing back only if we are not doing
			 * integrity sync. In case of integrity sync we have to
			 * keep going until we have written all the pages
			 * we tagged for writeback prior to entering this loop.
			 */
			if (--wbc->nr_to_write <= 0 &&
			    wbc->sync_mode == WB_SYNC_NONE) {
				done = 1;
				break;
			}
		}
		pagevec_release(&pvec);
		cond_resched();
	}

	/*
	 * If we hit the last page and there is more work to be done: wrap
	 * the index back to the start of the file for the next
	 * time we are called.
	 */
	if (wbc->range_cyclic && !done)
		done_index = 0;
	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		mapping->writeback_index = done_index;

	return ret;
}
EXPORT_SYMBOL(write_cache_pages);

/*
 * Function used by generic_writepages to call the real writepage
 * function and set the mapping flags on error.  @data carries the
 * address_space.
 */
static int __writepage(struct page *page, struct writeback_control *wbc,
		       void *data)
{
	struct address_space *mapping = data;
	int ret = mapping->a_ops->writepage(page, wbc);
	mapping_set_error(mapping, ret);
	return ret;
}

/**
 * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 *
 * This is a library function, which implements the writepages()
 * address_space_operation.
 *
 * Return: %0 on success, negative error code otherwise
 */
int generic_writepages(struct address_space *mapping,
		       struct writeback_control *wbc)
{
	struct blk_plug plug;
	int ret;

	/* deal with chardevs and other special file */
	if (!mapping->a_ops->writepage)
		return 0;

	/* Run the whole walk under a single block plug. */
	blk_start_plug(&plug);
	ret = write_cache_pages(mapping, wbc, __writepage, mapping);
	blk_finish_plug(&plug);
	return ret;
}

EXPORT_SYMBOL(generic_writepages);

/*
 * Write back @mapping through its ->writepages method, or through
 * generic_writepages() when the method is not provided.
 *
 * Returns 0 at once when @wbc->nr_to_write is not positive.  For WB_SYNC_ALL
 * an -ENOMEM result is retried after a congestion_wait(); any other result,
 * or any result in another sync mode, is returned to the caller.
 */
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	int ret;

	if (wbc->nr_to_write <= 0)
		return 0;
	while (1) {
		if (mapping->a_ops->writepages)
			ret = mapping->a_ops->writepages(mapping, wbc);
		else
			ret = generic_writepages(mapping, wbc);
		if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL))
			break;
		cond_resched();
		congestion_wait(BLK_RW_ASYNC, HZ/50);
	}
	return ret;
}

/**
 * write_one_page - write out a single page and wait on I/O
 * @page: the page to write
 *
 * The page must be locked by the caller and will be unlocked upon return.
 *
 * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
 * function returns.
* * Return: %0 on success, negative error code otherwise */ int write_one_page(struct page *page) { struct address_space *mapping = page->mapping; int ret = 0; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = 1, }; BUG_ON(!PageLocked(page)); wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { get_page(page); ret = mapping->a_ops->writepage(page, &wbc); if (ret == 0) wait_on_page_writeback(page); put_page(page); } else { unlock_page(page); } if (!ret) ret = filemap_check_errors(mapping); return ret; } EXPORT_SYMBOL(write_one_page); /* * For address_spaces which do not use buffers nor write back. */ int __set_page_dirty_no_writeback(struct page *page) { if (!PageDirty(page)) return !TestSetPageDirty(page); return 0; } /* * Helper function for set_page_dirty family. * * Caller must hold lock_page_memcg(). * * NOTE: This relies on being atomic wrt interrupts. */ void account_page_dirtied(struct page *page, struct address_space *mapping) { struct inode *inode = mapping->host; trace_writeback_dirty_page(page, mapping); if (mapping_can_writeback(mapping)) { struct bdi_writeback *wb; inode_attach_wb(inode, page); wb = inode_to_wb(inode); __inc_lruvec_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); __inc_node_page_state(page, NR_DIRTIED); inc_wb_stat(wb, WB_RECLAIMABLE); inc_wb_stat(wb, WB_DIRTIED); task_io_account_write(PAGE_SIZE); current->nr_dirtied++; this_cpu_inc(bdp_ratelimits); mem_cgroup_track_foreign_dirty(page, wb); } } /* * Helper function for deaccounting dirty page without writeback. * * Caller must hold lock_page_memcg(). */ void account_page_cleaned(struct page *page, struct address_space *mapping, struct bdi_writeback *wb) { if (mapping_can_writeback(mapping)) { dec_lruvec_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); task_io_account_cancelled_write(PAGE_SIZE); } } /* * For address_spaces which do not use buffers. 
Just tag the page as dirty in * the xarray. * * This is also used when a single buffer is being dirtied: we want to set the * page dirty in that case, but not all the buffers. This is a "bottom-up" * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. * * The caller must ensure this doesn't race with truncation. Most will simply * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and * the pte lock held, which also locks out truncation. */ int __set_page_dirty_nobuffers(struct page *page) { lock_page_memcg(page); if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); unsigned long flags; if (!mapping) { unlock_page_memcg(page); return 1; } xa_lock_irqsave(&mapping->i_pages, flags); BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); account_page_dirtied(page, mapping); __xa_set_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); xa_unlock_irqrestore(&mapping->i_pages, flags); unlock_page_memcg(page); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } return 1; } unlock_page_memcg(page); return 0; } EXPORT_SYMBOL(__set_page_dirty_nobuffers); /* * Call this whenever redirtying a page, to de-account the dirty counters * (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written * counters (NR_WRITTEN, WB_WRITTEN) in long term. The mismatches will lead to * systematic errors in balanced_dirty_ratelimit and the dirty pages position * control. 
*/ void account_page_redirty(struct page *page) { struct address_space *mapping = page->mapping; if (mapping && mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; wb = unlocked_inode_to_wb_begin(inode, &cookie); current->nr_dirtied--; dec_node_page_state(page, NR_DIRTIED); dec_wb_stat(wb, WB_DIRTIED); unlocked_inode_to_wb_end(inode, &cookie); } } EXPORT_SYMBOL(account_page_redirty); /* * When a writepage implementation decides that it doesn't want to write this * page for some reason, it should redirty the locked page via * redirty_page_for_writepage() and it should then unlock the page and return 0 */ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) { int ret; wbc->pages_skipped++; ret = __set_page_dirty_nobuffers(page); account_page_redirty(page); return ret; } EXPORT_SYMBOL(redirty_page_for_writepage); /* * Dirty a page. * * For pages with a mapping this should be done under the page lock * for the benefit of asynchronous memory errors who prefer a consistent * dirty state. This rule can be broken in some special cases, * but should be better not to. * * If the mapping doesn't provide a set_page_dirty a_op, then * just fall through and assume that it wants buffer_heads. */ int set_page_dirty(struct page *page) { struct address_space *mapping = page_mapping(page); page = compound_head(page); if (likely(mapping)) { int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; /* * readahead/lru_deactivate_page could remain * PG_readahead/PG_reclaim due to race with end_page_writeback * About readahead, if the page is written, the flags would be * reset. So no problem. * About lru_deactivate_page, if the page is redirty, the flag * will be reset. So no problem. but if the page is used by readahead * it will confuse readahead and make it restart the size rampup * process. But it's a trivial problem. 
*/ if (PageReclaim(page)) ClearPageReclaim(page); #ifdef CONFIG_BLOCK if (!spd) spd = __set_page_dirty_buffers; #endif return (*spd)(page); } if (!PageDirty(page)) { if (!TestSetPageDirty(page)) return 1; } return 0; } EXPORT_SYMBOL(set_page_dirty); /* * set_page_dirty() is racy if the caller has no reference against * page->mapping->host, and if the page is unlocked. This is because another * CPU could truncate the page off the mapping and then free the mapping. * * Usually, the page _is_ locked, or the caller is a user-space process which * holds a reference on the inode by having an open file. * * In other cases, the page should be locked before running set_page_dirty(). */ int set_page_dirty_lock(struct page *page) { int ret; lock_page(page); ret = set_page_dirty(page); unlock_page(page); return ret; } EXPORT_SYMBOL(set_page_dirty_lock); /* * This cancels just the dirty bit on the kernel page itself, it does NOT * actually remove dirty bits on any mmap's that may be around. It also * leaves the page tagged dirty, so any sync activity will still find it on * the dirty lists, and in particular, clear_page_dirty_for_io() will still * look at the dirty bits in the VM. * * Doing this should *normally* only ever be done when a page is truncated, * and is not actually mapped anywhere at all. However, fs/buffer.c does * this when it notices that somebody has cleaned out all the buffers on a * page without actually doing it through the VM. Can you say "ext3 is * horribly ugly"? Thought you could. 
*/
void __cancel_dirty_page(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	if (mapping_can_writeback(mapping)) {
		struct inode *inode = mapping->host;
		struct bdi_writeback *wb;
		struct wb_lock_cookie cookie = {};

		/* The memcg page lock is taken outside the wb begin/end pair. */
		lock_page_memcg(page);
		wb = unlocked_inode_to_wb_begin(inode, &cookie);

		/* Only undo the accounting if the flag was actually set. */
		if (TestClearPageDirty(page))
			account_page_cleaned(page, mapping, wb);

		unlocked_inode_to_wb_end(inode, &cookie);
		unlock_page_memcg(page);
	} else {
		/* No writeback accounting for this mapping: just drop the flag. */
		ClearPageDirty(page);
	}
}
EXPORT_SYMBOL(__cancel_dirty_page);

/*
 * Clear a page's dirty flag, while caring for dirty memory accounting.
 * Returns true if the page was previously dirty.
 *
 * This is for preparing to put the page under writeout.  We leave the page
 * tagged as dirty in the xarray so that a concurrent write-for-sync
 * can discover it via a PAGECACHE_TAG_DIRTY walk.  The ->writepage
 * implementation will run either set_page_writeback() or set_page_dirty(),
 * at which stage we bring the page's dirty flag and xarray dirty tag
 * back into sync.
 *
 * This incoherency between the page's dirty flag and xarray tag is
 * unfortunate, but it only exists while the page is locked.
 *
 * The page must be locked by the caller (checked with VM_BUG_ON_PAGE).
 */
int clear_page_dirty_for_io(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	int ret = 0;

	VM_BUG_ON_PAGE(!PageLocked(page), page);

	if (mapping && mapping_can_writeback(mapping)) {
		struct inode *inode = mapping->host;
		struct bdi_writeback *wb;
		struct wb_lock_cookie cookie = {};

		/*
		 * Yes, Virginia, this is indeed insane.
		 *
		 * We use this sequence to make sure that
		 *  (a) we account for dirty stats properly
		 *  (b) we tell the low-level filesystem to
		 *      mark the whole page dirty if it was
		 *      dirty in a pagetable. Only to then
		 *  (c) clean the page again and return 1 to
		 *      cause the writeback.
		 *
		 * This way we avoid all nasty races with the
		 * dirty bit in multiple places and clearing
		 * them concurrently from different threads.
		 *
		 * Note! Normally the "set_page_dirty(page)"
		 * has no effect on the actual dirty bit - since
		 * that will already usually be set. But we
		 * need the side effects, and it can help us
		 * avoid races.
		 *
		 * We basically use the page "master dirty bit"
		 * as a serialization point for all the different
		 * threads doing their things.
		 */
		if (page_mkclean(page))
			set_page_dirty(page);
		/*
		 * We carefully synchronise fault handlers against
		 * installing a dirty pte and marking the page dirty
		 * at this point.  We do this by having them hold the
		 * page lock while dirtying the page, and pages are
		 * always locked coming in here, so we get the desired
		 * exclusion.
		 */
		wb = unlocked_inode_to_wb_begin(inode, &cookie);
		if (TestClearPageDirty(page)) {
			/* Flag was set: drop the lruvec, zone and wb counters. */
			dec_lruvec_page_state(page, NR_FILE_DIRTY);
			dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
			dec_wb_stat(wb, WB_RECLAIMABLE);
			ret = 1;
		}
		unlocked_inode_to_wb_end(inode, &cookie);
		return ret;
	}
	/* No writeback-capable mapping: clear the flag without accounting. */
	return TestClearPageDirty(page);
}
EXPORT_SYMBOL(clear_page_dirty_for_io);

/*
 * Clear the writeback flag on @page and update the related accounting.
 *
 * When the mapping uses writeback tags, the flag, the xarray
 * PAGECACHE_TAG_WRITEBACK mark and the per-wb counters are updated under
 * the i_pages lock with interrupts disabled.  The whole operation runs
 * between lock_page_memcg() and __unlock_page_memcg().
 *
 * Returns the result of TestClearPageWriteback(), i.e. non-zero if the
 * flag was set on entry.
 */
int test_clear_page_writeback(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;
	int ret;

	memcg = lock_page_memcg(page);
	lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
	if (mapping && mapping_use_writeback_tags(mapping)) {
		struct inode *inode = mapping->host;
		struct backing_dev_info *bdi = inode_to_bdi(inode);
		unsigned long flags;

		xa_lock_irqsave(&mapping->i_pages, flags);
		ret = TestClearPageWriteback(page);
		if (ret) {
			__xa_clear_mark(&mapping->i_pages, page_index(page),
						PAGECACHE_TAG_WRITEBACK);
			if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
				struct bdi_writeback *wb = inode_to_wb(inode);

				dec_wb_stat(wb, WB_WRITEBACK);
				__wb_writeout_inc(wb);
			}
		}

		/* No page of this mapping is tagged for writeback any more. */
		if (mapping->host && !mapping_tagged(mapping,
						     PAGECACHE_TAG_WRITEBACK))
			sb_clear_inode_writeback(mapping->host);

		xa_unlock_irqrestore(&mapping->i_pages, flags);
	} else {
		ret = TestClearPageWriteback(page);
	}
	if (ret) {
		/* The lruvec was looked up above, while the memcg lock was taken. */
		dec_lruvec_state(lruvec, NR_WRITEBACK);
		dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
		inc_node_page_state(page, NR_WRITTEN);
	}
	__unlock_page_memcg(memcg);
	return ret;
}

/*
 * Set the writeback flag on @page and update the related accounting.
 *
 * @keep_write: when false, the PAGECACHE_TAG_TOWRITE mark is cleared for
 *		this page as well.
 *
 * When the mapping uses writeback tags, the flag and the xarray marks are
 * updated under the i_pages lock with interrupts disabled; the
 * PAGECACHE_TAG_DIRTY mark is dropped if the page is no longer dirty.
 *
 * Returns the result of TestSetPageWriteback(), i.e. non-zero if the flag
 * was already set on entry.
 */
int __test_set_page_writeback(struct page *page, bool keep_write)
{
	struct address_space *mapping = page_mapping(page);
	int ret, access_ret;

	lock_page_memcg(page);
	if (mapping && mapping_use_writeback_tags(mapping)) {
		XA_STATE(xas, &mapping->i_pages, page_index(page));
		struct inode *inode = mapping->host;
		struct backing_dev_info *bdi = inode_to_bdi(inode);
		unsigned long flags;

		xas_lock_irqsave(&xas, flags);
		xas_load(&xas);
		ret = TestSetPageWriteback(page);
		if (!ret) {
			bool on_wblist;

			/* Sampled before this page's own mark is set below. */
			on_wblist = mapping_tagged(mapping,
						   PAGECACHE_TAG_WRITEBACK);

			xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
			if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT)
				inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);

			/*
			 * We can come through here when swapping anonymous
			 * pages, so we don't necessarily have an inode to track
			 * for sync.
			 */
			if (mapping->host && !on_wblist)
				sb_mark_inode_writeback(mapping->host);
		}
		if (!PageDirty(page))
			xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
		if (!keep_write)
			xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
		xas_unlock_irqrestore(&xas, flags);
	} else {
		ret = TestSetPageWriteback(page);
	}
	if (!ret) {
		/* Flag newly set: count the page as under writeback. */
		inc_lruvec_page_state(page, NR_WRITEBACK);
		inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
	}
	unlock_page_memcg(page);
	access_ret = arch_make_page_accessible(page);
	/*
	 * If writeback has been triggered on a page that cannot be made
	 * accessible, it is too late to recover here.
	 */
	VM_BUG_ON_PAGE(access_ret != 0, page);

	return ret;

}
EXPORT_SYMBOL(__test_set_page_writeback);

/*
 * Wait for a page to complete writeback.
 *
 * Loops for as long as PageWriteback() is set, emitting the
 * wait_on_page_writeback tracepoint before each wait on PG_writeback.
 */
void wait_on_page_writeback(struct page *page)
{
	while (PageWriteback(page)) {
		trace_wait_on_page_writeback(page, page_mapping(page));
		wait_on_page_bit(page, PG_writeback);
	}
}
EXPORT_SYMBOL_GPL(wait_on_page_writeback);

/**
 * wait_for_stable_page() - wait for writeback to finish, if necessary.
 * @page:	The page to wait on.
* * This function determines if the given page is related to a backing device * that requires page contents to be held stable during writeback. If so, then * it will wait for any pending writeback to complete. */ void wait_for_stable_page(struct page *page) { page = thp_head(page); if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES) wait_on_page_writeback(page); } EXPORT_SYMBOL_GPL(wait_for_stable_page);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 /* SPDX-License-Identifier: 
GPL-2.0-or-later */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Definitions for the UDP module.
 *
 * Version:	@(#)udp.h	1.0.2	05/07/93
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *
 * Fixes:
 *		Alan Cox	: Turned on udp checksums. I don't want to
 *				  chase 'memory corruption' bugs that aren't!
 */
#ifndef _UDP_H
#define _UDP_H

#include <linux/list.h>
#include <linux/bug.h>
#include <net/inet_sock.h>
#include <net/sock.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/indirect_call_wrapper.h>

/**
 *	struct udp_skb_cb  -  UDP(-Lite) private variables
 *
 *	@header:      private variables used by IPv4/IPv6
 *	@cscov:       checksum coverage length (UDP-Lite only)
 *	@partial_cov: if set indicates partial csum coverage
 */
struct udp_skb_cb {
	union {
		struct inet_skb_parm	h4;
#if IS_ENABLED(CONFIG_IPV6)
		struct inet6_skb_parm	h6;
#endif
	} header;
	__u16		cscov;
	__u8		partial_cov;
};
/* View the control buffer (cb) of @__skb as a struct udp_skb_cb. */
#define UDP_SKB_CB(__skb)	((struct udp_skb_cb *)((__skb)->cb))

/**
 *	struct udp_hslot - UDP hash slot
 *
 *	@head:	head of list of sockets
 *	@count:	number of sockets in 'head' list
 *	@lock:	spinlock protecting changes to head/count
 */
struct udp_hslot {
	struct hlist_head	head;
	int			count;
	spinlock_t		lock;
} __attribute__((aligned(2 * sizeof(long))));

/**
 *	struct udp_table - UDP table
 *
 *	@hash:	hash table, sockets are hashed on (local port)
 *	@hash2:	hash table, sockets are hashed on (local port, local address)
 *	@mask:	number of slots in hash tables, minus 1
 *	@log:	log2(number of slots in hash table)
 */
struct udp_table {
	struct udp_hslot	*hash;
	struct udp_hslot	*hash2;
	unsigned int		mask;
	unsigned int		log;
};
extern struct udp_table udp_table;
void udp_table_init(struct udp_table *, const char *);

/* Primary hash slot for @num in @net: indexed by udp_hashfn(). */
static inline struct udp_hslot *udp_hashslot(struct udp_table *table,
					     struct net *net, unsigned int num)
{
	return &table->hash[udp_hashfn(net, num, table->mask)];
}
/*
 * For secondary hash, net_hash_mix() is performed before calling
 * udp_hashslot2(), this explains difference with udp_hashslot()
 */
static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
					      unsigned int hash)
{
	return &table->hash2[hash & table->mask];
}

extern struct proto udp_prot;

extern atomic_long_t udp_memory_allocated;

/* sysctl variables for udp */
extern long sysctl_udp_mem[3];
extern int sysctl_udp_rmem_min;
extern int sysctl_udp_wmem_min;

struct sk_buff;

/*
 *	Generic checksumming routines for UDP(-Lite) v4 and v6
 */

/*
 * Uses __skb_checksum_complete() when the coverage length equals skb->len,
 * otherwise passes the coverage length to __skb_checksum_complete_head().
 */
static inline __sum16 __udp_lib_checksum_complete(struct sk_buff *skb)
{
	return (UDP_SKB_CB(skb)->cscov == skb->len ?
		__skb_checksum_complete(skb) :
		__skb_checksum_complete_head(skb, UDP_SKB_CB(skb)->cscov));
}

/*
 * Returns 0 without computing anything when skb_csum_unnecessary() is true;
 * otherwise non-zero iff __udp_lib_checksum_complete() returns non-zero.
 */
static inline int udp_lib_checksum_complete(struct sk_buff *skb)
{
	return !skb_csum_unnecessary(skb) &&
		__udp_lib_checksum_complete(skb);
}

/**
 * 	udp_csum_outgoing  -  compute UDPv4/v6 checksum over fragments
 * 	@sk: 	socket we are writing to
 * 	@skb: 	sk_buff containing the filled-in UDP header
 * 	        (checksum field must be zeroed out)
 */
static inline __wsum udp_csum_outgoing(struct sock *sk, struct sk_buff *skb)
{
	__wsum csum = csum_partial(skb_transport_header(skb),
				   sizeof(struct udphdr), 0);
	/* @skb is reused as the cursor over the socket's write queue. */
	skb_queue_walk(&sk->sk_write_queue, skb) {
		csum = csum_add(csum, skb->csum);
	}
	return csum;
}

/*
 * Checksum of the UDP header seeded with skb->csum, plus the csum of every
 * skb on the frag_list.
 */
static inline __wsum udp_csum(struct sk_buff *skb)
{
	__wsum csum = csum_partial(skb_transport_header(skb),
				   sizeof(struct udphdr), skb->csum);

	for (skb = skb_shinfo(skb)->frag_list; skb; skb = skb->next) {
		csum = csum_add(csum, skb->csum);
	}
	return csum;
}

/* csum_tcpudp_magic() with the protocol fixed to IPPROTO_UDP. */
static inline __sum16 udp_v4_check(int len, __be32 saddr,
				   __be32 daddr, __wsum base)
{
	return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base);
}

void udp_set_csum(bool nocheck, struct sk_buff *skb,
		  __be32 saddr, __be32 daddr, int len);

/*
 * Pull the UDP header off @skb and shrink the coverage length by the header
 * size.  When the checksum is neither validated nor provided
 * (CHECKSUM_NONE), the header bytes are folded into skb->csum first.
 */
static inline void udp_csum_pull_header(struct sk_buff *skb)
{
	if (!skb->csum_valid && skb->ip_summed == CHECKSUM_NONE)
		skb->csum = csum_partial(skb->data, sizeof(struct udphdr),
					 skb->csum);
	skb_pull_rcsum(skb, sizeof(struct udphdr));
	UDP_SKB_CB(skb)->cscov -= sizeof(struct udphdr);
}

typedef struct sock *(*udp_lookup_t)(struct sk_buff *skb, __be16 sport,
				     __be16 dport);

INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *,
							   struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int));
INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *,
							   struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int));
struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
				struct udphdr *uh, struct sock *sk);
int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup);

struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
				  netdev_features_t features, bool is_ipv6);

/*
 * Locate the UDP header at the current GRO offset.  The fast accessor is
 * tried first; the slow one is used when skb_gro_header_hard() says so.
 */
static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
{
	struct udphdr *uh;
	unsigned int hlen, off;

	off  = skb_gro_offset(skb);
	hlen = off + sizeof(*uh);
	uh   = skb_gro_header_fast(skb, off);
	if (skb_gro_header_hard(skb, hlen))
		uh = skb_gro_header_slow(skb, hlen, off);

	return uh;
}

/* hash routines shared between UDPv4/6 and UDP-Litev4/6 */
/* Must never be called: hits BUG(). */
static inline int udp_lib_hash(struct sock *sk)
{
	BUG();
	return 0;
}

void udp_lib_unhash(struct sock *sk);
void udp_lib_rehash(struct sock *sk, u16 new_hash);

/* @timeout is unused; the socket is released via sk_common_release(). */
static inline void udp_lib_close(struct sock *sk, long timeout)
{
	sk_common_release(sk);
}

int udp_lib_get_port(struct sock *sk, unsigned short snum,
		     unsigned int hash2_nulladdr);

u32 udp_flow_hashrnd(void);

/*
 * Derive a source port from the flow hash of @skb, scaled into the range
 * starting at @min and below @max.  When @min >= @max the range from
 * inet_get_local_port_range() is used instead.  The result is in network
 * byte order.
 */
static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb,
				       int min, int max, bool use_eth)
{
	u32 hash;

	if (min >= max) {
		/* Use default range */
		inet_get_local_port_range(net, &min, &max);
	}

	hash = skb_get_hash(skb);
	if (unlikely(!hash)) {
		if (use_eth) {
			/* Can't find a normal hash, caller has indicated an
			 * Ethernet packet so use that to compute a hash.
			 */
			hash = jhash(skb->data, 2 * ETH_ALEN,
				     (__force u32) skb->protocol);
		} else {
			/* Can't derive any sort of hash for the packet, set
			 * to some consistent random value.
			 */
			hash = udp_flow_hashrnd();
		}
	}

	/* Since this is being sent on the wire obfuscate hash a bit
	 * to minimize possibility that any useful information to an
	 * attacker is leaked. Only upper 16 bits are relevant in the
	 * computation for 16 bit port value.
	 */
	hash ^= hash << 16;

	return htons((((u64) hash * (max - min)) >> 32) + min);
}

/* Receive memory allocated to @sk minus its forward_deficit. */
static inline int udp_rqueue_get(struct sock *sk)
{
	return sk_rmem_alloc_get(sk) - READ_ONCE(udp_sk(sk)->forward_deficit);
}

/*
 * Wrapper around inet_bound_dev_eq(): its first argument comes from the
 * udp_l3mdev_accept sysctl with CONFIG_NET_L3_MASTER_DEV, and is true
 * otherwise.
 */
static inline bool udp_sk_bound_dev_eq(struct net *net, int bound_dev_if,
				       int dif, int sdif)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
	return inet_bound_dev_eq(!!net->ipv4.sysctl_udp_l3mdev_accept,
				 bound_dev_if, dif, sdif);
#else
	return inet_bound_dev_eq(true, bound_dev_if, dif, sdif);
#endif
}

/* net/ipv4/udp.c */
void udp_destruct_sock(struct sock *sk);
void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb);
void udp_skb_destructor(struct sock *sk, struct sk_buff *skb);
struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
			       int noblock, int *off, int *err);
/* __skb_recv_udp() with a local offset that starts at 0. */
static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags,
					   int noblock, int *err)
{
	int off = 0;

	return __skb_recv_udp(sk, flags, noblock, &off, err);
}

int udp_v4_early_demux(struct sk_buff *skb);
bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst);
int udp_get_port(struct sock *sk, unsigned short snum,
		 int (*saddr_cmp)(const struct sock *,
				  const struct sock *));
int udp_err(struct sk_buff *, u32);
int udp_abort(struct sock *sk, int err);
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
int udp_push_pending_frames(struct sock *sk);
void udp_flush_pending_frames(struct sock *sk);
int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size);
void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst);
int udp_rcv(struct sk_buff *skb);
int udp_ioctl(struct sock *sk, int cmd, unsigned long arg);
int udp_init_sock(struct sock *sk);
int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
int __udp_disconnect(struct sock *sk, int flags);
int udp_disconnect(struct sock *sk, int flags);
__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait);
struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
				       netdev_features_t features,
				       bool is_ipv6);
int udp_lib_getsockopt(struct sock *sk, int level, int optname,
		       char __user *optval, int __user *optlen);
int udp_lib_setsockopt(struct sock *sk, int level, int optname,
		       sockptr_t optval, unsigned int optlen,
		       int (*push_pending_frames)(struct sock *));
struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
			     __be32 daddr, __be16 dport, int dif);
struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
			       __be32 daddr, __be16 dport, int dif, int sdif,
			       struct udp_table *tbl, struct sk_buff *skb);
struct sock *udp4_lib_lookup_skb(struct sk_buff *skb,
				 __be16 sport, __be16 dport);
struct sock *udp6_lib_lookup(struct net *net,
			     const struct in6_addr *saddr, __be16 sport,
			     const struct in6_addr *daddr, __be16 dport,
			     int dif);
struct sock *__udp6_lib_lookup(struct net *net,
			       const struct in6_addr *saddr, __be16 sport,
			       const struct in6_addr *daddr, __be16 dport,
			       int dif, int sdif, struct udp_table *tbl,
			       struct sk_buff *skb);
struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
				 __be16 sport, __be16 dport);

/* UDP uses skb->dev_scratch to cache as much information as possible and avoid
 * possibly multiple cache miss on dequeue()
 */
struct udp_dev_scratch {
	/* skb->truesize and the stateless bit are embedded in a single field;
	 * do not use a bitfield since the compiler emits better/smaller code
	 * this way
	 */
	u32 _tsize_state;

#if BITS_PER_LONG == 64
	/* len and the bit needed to compute skb_csum_unnecessary
	 * will be on cold cache lines at recvmsg time.
	 * skb->len can be stored on 16 bits since the udp header has been
	 * already validated and pulled.
	 */
	u16 len;
	bool is_linear;
	bool csum_unnecessary;
#endif
};

/* View skb->dev_scratch as a struct udp_dev_scratch. */
static inline struct udp_dev_scratch *udp_skb_scratch(struct sk_buff *skb)
{
	return (struct udp_dev_scratch *)&skb->dev_scratch;
}

/*
 * On 64-bit builds the three accessors below read the values cached in the
 * scratch area; otherwise they fall back to the skb itself.
 */
#if BITS_PER_LONG == 64
static inline unsigned int udp_skb_len(struct sk_buff *skb)
{
	return udp_skb_scratch(skb)->len;
}

static inline bool udp_skb_csum_unnecessary(struct sk_buff *skb)
{
	return udp_skb_scratch(skb)->csum_unnecessary;
}

static inline bool udp_skb_is_linear(struct sk_buff *skb)
{
	return udp_skb_scratch(skb)->is_linear;
}

#else
static inline unsigned int udp_skb_len(struct sk_buff *skb)
{
	return skb->len;
}

static inline bool udp_skb_csum_unnecessary(struct sk_buff *skb)
{
	return skb_csum_unnecessary(skb);
}

static inline bool udp_skb_is_linear(struct sk_buff *skb)
{
	return !skb_is_nonlinear(skb);
}
#endif

/*
 * Copy @len bytes starting at skb->data + @off into @to.  Returns 0 when
 * all bytes were copied; on a short copy the iterator is reverted by the
 * number of bytes copied and -EFAULT is returned.
 */
static inline int copy_linear_skb(struct sk_buff *skb, int len, int off,
				  struct iov_iter *to)
{
	int n;

	n = copy_to_iter(skb->data + off, len, to);
	if (n == len)
		return 0;

	iov_iter_revert(to, n);
	return -EFAULT;
}

/*
 * 	SNMP statistics for UDP and UDP-Lite
 */
#define UDP_INC_STATS(net, field, is_udplite)		      do { \
	if (is_udplite) SNMP_INC_STATS((net)->mib.udplite_statistics, field);       \
	else		SNMP_INC_STATS((net)->mib.udp_statistics, field);  }  while(0)
#define __UDP_INC_STATS(net, field, is_udplite) 	      do { \
	if (is_udplite) __SNMP_INC_STATS((net)->mib.udplite_statistics, field);         \
	else		__SNMP_INC_STATS((net)->mib.udp_statistics, field);    }  while(0)

#define __UDP6_INC_STATS(net, field, is_udplite)	    do { \
	if (is_udplite) __SNMP_INC_STATS((net)->mib.udplite_stats_in6, field);\
	else		__SNMP_INC_STATS((net)->mib.udp_stats_in6, field);  \
} while(0)
#define UDP6_INC_STATS(net, field, __lite)		    do { \
	if (__lite) SNMP_INC_STATS((net)->mib.udplite_stats_in6, field);  \
	else	    SNMP_INC_STATS((net)->mib.udp_stats_in6, field);      \
} while(0)

/*
 * Pick the MIB for @sk: UDP vs UDP-Lite by IS_UDPLITE(), and, when IPv6 is
 * enabled, the IPv4 vs IPv6 variant by @ipv4.
 */
#if IS_ENABLED(CONFIG_IPV6)
#define __UDPX_MIB(sk, ipv4)						\
({									\
	ipv4 ? (IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_statistics :	\
				 sock_net(sk)->mib.udp_statistics) :	\
		(IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_stats_in6 :	\
				 sock_net(sk)->mib.udp_stats_in6);	\
})
#else
#define __UDPX_MIB(sk, ipv4)						\
({									\
	IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_statistics :		\
			 sock_net(sk)->mib.udp_statistics;		\
})
#endif

#define __UDPX_INC_STATS(sk, field) \
	__SNMP_INC_STATS(__UDPX_MIB(sk, (sk)->sk_family == AF_INET), field)

#ifdef CONFIG_PROC_FS
struct udp_seq_afinfo {
	sa_family_t			family;
	struct udp_table		*udp_table;
};

struct udp_iter_state {
	struct seq_net_private  p;
	int			bucket;
	struct udp_seq_afinfo	*bpf_seq_afinfo;
};

void *udp_seq_start(struct seq_file *seq, loff_t *pos);
void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos);
void udp_seq_stop(struct seq_file *seq, void *v);

extern const struct seq_operations udp_seq_ops;
extern const struct seq_operations udp6_seq_ops;

int udp4_proc_init(void);
void udp4_proc_exit(void);
#endif /* CONFIG_PROC_FS */

int udpv4_offload_init(void);

void udp_init(void);

DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key);
void udp_encap_enable(void);
#if IS_ENABLED(CONFIG_IPV6)
DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
void udpv6_encap_enable(void);
#endif

/*
 * Segment @skb via __skb_gso_segment() on behalf of @sk.
 *
 * On success the original skb is consumed and the segment list returned.
 * On failure (error pointer or NULL) gso_segs drops are added to
 * sk->sk_drops and to UDP_MIB_INERRORS of the MIB selected by @ipv4, the
 * skb is freed and NULL is returned.
 */
static inline struct sk_buff *udp_rcv_segment(struct sock *sk,
					      struct sk_buff *skb, bool ipv4)
{
	netdev_features_t features = NETIF_F_SG;
	struct sk_buff *segs;

	/* Avoid csum recalculation by skb_segment unless userspace explicitly
	 * asks for the final checksum values
	 */
	if (!inet_get_convert_csum(sk))
		features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;

	/* UDP segmentation expects packets of type CHECKSUM_PARTIAL or
	 * CHECKSUM_NONE in __udp_gso_segment. UDP GRO indeed builds partial
	 * packets in udp_gro_complete_segment. As does UDP GSO, verified by
	 * udp_send_skb. But when those packets are looped in dev_loopback_xmit
	 * their ip_summed CHECKSUM_NONE is changed to CHECKSUM_UNNECESSARY.
	 * Reset in this specific case, where PARTIAL is both correct and
	 * required.
	 */
	if (skb->pkt_type == PACKET_LOOPBACK)
		skb->ip_summed = CHECKSUM_PARTIAL;

	/* the GSO CB lays after the UDP one, no need to save and restore any
	 * CB fragment
	 */
	segs = __skb_gso_segment(skb, features, false);
	if (IS_ERR_OR_NULL(segs)) {
		int segs_nr = skb_shinfo(skb)->gso_segs;

		atomic_add(segs_nr, &sk->sk_drops);
		SNMP_ADD_STATS(__UDPX_MIB(sk, ipv4), UDP_MIB_INERRORS, segs_nr);
		kfree_skb(skb);
		return NULL;
	}

	consume_skb(skb);
	return segs;
}

#ifdef CONFIG_BPF_STREAM_PARSER
struct sk_psock;
struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
#endif /* BPF_STREAM_PARSER */

#endif	/* _UDP_H */
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 
505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 
1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 
1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 
1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 
2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 // SPDX-License-Identifier: GPL-2.0-only /* * (C) 1997 Linus Torvalds * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation) */ #include <linux/export.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/backing-dev.h> #include <linux/hash.h> #include <linux/swap.h> #include <linux/security.h> #include <linux/cdev.h> #include <linux/memblock.h> #include <linux/fscrypt.h> #include <linux/fsnotify.h> #include <linux/mount.h> #include <linux/posix_acl.h> #include <linux/prefetch.h> #include <linux/buffer_head.h> /* for inode_has_buffers */ #include <linux/ratelimit.h> #include <linux/list_lru.h> #include <linux/iversion.h> #include <trace/events/writeback.h> #include "internal.h" /* * Inode locking rules: * * inode->i_lock protects: * inode->i_state, inode->i_hash, __iget() * Inode LRU list locks protect: * inode->i_sb->s_inode_lru, inode->i_lru * inode->i_sb->s_inode_list_lock protects: * inode->i_sb->s_inodes, inode->i_sb_list * bdi->wb.list_lock protects: * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list * inode_hash_lock protects: * 
inode_hashtable, inode->i_hash * * Lock ordering: * * inode->i_sb->s_inode_list_lock * inode->i_lock * Inode LRU list locks * * bdi->wb.list_lock * inode->i_lock * * inode_hash_lock * inode->i_sb->s_inode_list_lock * inode->i_lock * * iunique_lock * inode_hash_lock */ static unsigned int i_hash_mask __read_mostly; static unsigned int i_hash_shift __read_mostly; static struct hlist_head *inode_hashtable __read_mostly; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); /* * Empty aops. Can be used for the cases where the user does not * define any of the address_space operations. */ const struct address_space_operations empty_aops = { }; EXPORT_SYMBOL(empty_aops); /* * Statistics gathering.. */ struct inodes_stat_t inodes_stat; static DEFINE_PER_CPU(unsigned long, nr_inodes); static DEFINE_PER_CPU(unsigned long, nr_unused); static struct kmem_cache *inode_cachep __read_mostly; static long get_nr_inodes(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_inodes, i); return sum < 0 ? 0 : sum; } static inline long get_nr_inodes_unused(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_unused, i); return sum < 0 ? 0 : sum; } long get_nr_dirty_inodes(void) { /* not actually dirty inodes, but a wild approximation */ long nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); return nr_dirty > 0 ? 
nr_dirty : 0; } /* * Handle nr_inode sysctl */ #ifdef CONFIG_SYSCTL int proc_nr_inodes(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); inodes_stat.nr_unused = get_nr_inodes_unused(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif static int no_open(struct inode *inode, struct file *file) { return -ENXIO; } /** * inode_init_always - perform inode structure initialisation * @sb: superblock inode belongs to * @inode: inode to initialise * * These are initializations that need to be done on every inode * allocation as the fields are not initialised by slab allocation. */ int inode_init_always(struct super_block *sb, struct inode *inode) { static const struct inode_operations empty_iops; static const struct file_operations no_open_fops = {.open = no_open}; struct address_space *const mapping = &inode->i_data; inode->i_sb = sb; inode->i_blkbits = sb->s_blocksize_bits; inode->i_flags = 0; atomic64_set(&inode->i_sequence, 0); atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; inode->i_fop = &no_open_fops; inode->__i_nlink = 1; inode->i_opflags = 0; if (sb->s_xattr) inode->i_opflags |= IOP_XATTR; i_uid_write(inode, 0); i_gid_write(inode, 0); atomic_set(&inode->i_writecount, 0); inode->i_size = 0; inode->i_write_hint = WRITE_LIFE_NOT_SET; inode->i_blocks = 0; inode->i_bytes = 0; inode->i_generation = 0; inode->i_pipe = NULL; inode->i_bdev = NULL; inode->i_cdev = NULL; inode->i_link = NULL; inode->i_dir_seq = 0; inode->i_rdev = 0; inode->dirtied_when = 0; #ifdef CONFIG_CGROUP_WRITEBACK inode->i_wb_frn_winner = 0; inode->i_wb_frn_avg_time = 0; inode->i_wb_frn_history = 0; #endif if (security_inode_alloc(inode)) goto out; spin_lock_init(&inode->i_lock); lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); init_rwsem(&inode->i_rwsem); lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key); atomic_set(&inode->i_dio_count, 0); mapping->a_ops = &empty_aops; 
mapping->host = inode; mapping->flags = 0; if (sb->s_type->fs_flags & FS_THP_SUPPORT) __set_bit(AS_THP_SUPPORT, &mapping->flags); mapping->wb_err = 0; atomic_set(&mapping->i_mmap_writable, 0); #ifdef CONFIG_READ_ONLY_THP_FOR_FS atomic_set(&mapping->nr_thps, 0); #endif mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->private_data = NULL; mapping->writeback_index = 0; inode->i_private = NULL; inode->i_mapping = mapping; INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ #ifdef CONFIG_FS_POSIX_ACL inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; #endif #ifdef CONFIG_FSNOTIFY inode->i_fsnotify_mask = 0; #endif inode->i_flctx = NULL; this_cpu_inc(nr_inodes); return 0; out: return -ENOMEM; } EXPORT_SYMBOL(inode_init_always); void free_inode_nonrcu(struct inode *inode) { kmem_cache_free(inode_cachep, inode); } EXPORT_SYMBOL(free_inode_nonrcu); static void i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); if (inode->free_inode) inode->free_inode(inode); else free_inode_nonrcu(inode); } static struct inode *alloc_inode(struct super_block *sb) { const struct super_operations *ops = sb->s_op; struct inode *inode; if (ops->alloc_inode) inode = ops->alloc_inode(sb); else inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); if (!inode) return NULL; if (unlikely(inode_init_always(sb, inode))) { if (ops->destroy_inode) { ops->destroy_inode(inode); if (!ops->free_inode) return NULL; } inode->free_inode = ops->free_inode; i_callback(&inode->i_rcu); return NULL; } return inode; } void __destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); inode_detach_wb(inode); security_inode_free(inode); fsnotify_inode_delete(inode); locks_free_lock_context(inode); if (!inode->i_nlink) { WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); atomic_long_dec(&inode->i_sb->s_remove_count); } #ifdef CONFIG_FS_POSIX_ACL if (inode->i_acl && !is_uncached_acl(inode->i_acl)) 
posix_acl_release(inode->i_acl); if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl)) posix_acl_release(inode->i_default_acl); #endif this_cpu_dec(nr_inodes); } EXPORT_SYMBOL(__destroy_inode); static void destroy_inode(struct inode *inode) { const struct super_operations *ops = inode->i_sb->s_op; BUG_ON(!list_empty(&inode->i_lru)); __destroy_inode(inode); if (ops->destroy_inode) { ops->destroy_inode(inode); if (!ops->free_inode) return; } inode->free_inode = ops->free_inode; call_rcu(&inode->i_rcu, i_callback); } /** * drop_nlink - directly drop an inode's link count * @inode: inode * * This is a low-level filesystem helper to replace any * direct filesystem manipulation of i_nlink. In cases * where we are attempting to track writes to the * filesystem, a decrement to zero means an imminent * write when the file is truncated and actually unlinked * on the filesystem. */ void drop_nlink(struct inode *inode) { WARN_ON(inode->i_nlink == 0); inode->__i_nlink--; if (!inode->i_nlink) atomic_long_inc(&inode->i_sb->s_remove_count); } EXPORT_SYMBOL(drop_nlink); /** * clear_nlink - directly zero an inode's link count * @inode: inode * * This is a low-level filesystem helper to replace any * direct filesystem manipulation of i_nlink. See * drop_nlink() for why we care about i_nlink hitting zero. */ void clear_nlink(struct inode *inode) { if (inode->i_nlink) { inode->__i_nlink = 0; atomic_long_inc(&inode->i_sb->s_remove_count); } } EXPORT_SYMBOL(clear_nlink); /** * set_nlink - directly set an inode's link count * @inode: inode * @nlink: new nlink (should be non-zero) * * This is a low-level filesystem helper to replace any * direct filesystem manipulation of i_nlink. 
*/
void set_nlink(struct inode *inode, unsigned int nlink)
{
	if (!nlink) {
		/* zeroing goes through clear_nlink() so s_remove_count is kept in sync */
		clear_nlink(inode);
	} else {
		/* Yes, some filesystems do change nlink from zero to one */
		if (inode->i_nlink == 0)
			atomic_long_dec(&inode->i_sb->s_remove_count);

		inode->__i_nlink = nlink;
	}
}
EXPORT_SYMBOL(set_nlink);

/**
 * inc_nlink - directly increment an inode's link count
 * @inode: inode
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink.  Currently,
 * it is only here for parity with dec_nlink().
 *
 * Going from zero to one link warns unless I_LINKABLE is set in
 * i_state, and drops the superblock's s_remove_count again.
 */
void inc_nlink(struct inode *inode)
{
	if (unlikely(inode->i_nlink == 0)) {
		WARN_ON(!(inode->i_state & I_LINKABLE));
		atomic_long_dec(&inode->i_sb->s_remove_count);
	}

	inode->__i_nlink++;
}
EXPORT_SYMBOL(inc_nlink);

/*
 * Set up the members of @mapping that need a constructor: the i_pages
 * xarray, i_mmap_rwsem, private_list, private_lock and the i_mmap tree.
 * Does not zero the structure; callers do that themselves.
 */
static void __address_space_init_once(struct address_space *mapping)
{
	xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
	init_rwsem(&mapping->i_mmap_rwsem);
	INIT_LIST_HEAD(&mapping->private_list);
	spin_lock_init(&mapping->private_lock);
	mapping->i_mmap = RB_ROOT_CACHED;
}

/* Zero @mapping, then run the one-time initialisation above on it. */
void address_space_init_once(struct address_space *mapping)
{
	memset(mapping, 0, sizeof(*mapping));
	__address_space_init_once(mapping);
}
EXPORT_SYMBOL(address_space_init_once);

/*
 * These are initializations that only need to be done
 * once, because the fields are idempotent across use
 * of the inode, so let the slab be aware of that.
*/
void inode_init_once(struct inode *inode)
{
	memset(inode, 0, sizeof(*inode));
	INIT_HLIST_NODE(&inode->i_hash);
	INIT_LIST_HEAD(&inode->i_devices);
	INIT_LIST_HEAD(&inode->i_io_list);
	INIT_LIST_HEAD(&inode->i_wb_list);
	INIT_LIST_HEAD(&inode->i_lru);
	__address_space_init_once(&inode->i_data);
	i_size_ordered_init(inode);
}
EXPORT_SYMBOL(inode_init_once);

/* void * adapter around inode_init_once(). */
static void init_once(void *foo)
{
	struct inode *inode = (struct inode *) foo;

	inode_init_once(inode);
}

/*
 * Take an extra reference on @inode.
 *
 * inode->i_lock must be held
 */
void __iget(struct inode *inode)
{
	atomic_inc(&inode->i_count);
}

/*
 * get additional reference to inode; caller must already hold one.
 * Warns if the count after the increment is below 2, i.e. the caller
 * did not in fact hold a reference.
 */
void ihold(struct inode *inode)
{
	WARN_ON(atomic_inc_return(&inode->i_count) < 2);
}
EXPORT_SYMBOL(ihold);

/*
 * Put @inode on its superblock's inode LRU. nr_unused is bumped when
 * list_lru_add() reports the inode was added; otherwise the inode is
 * marked I_REFERENCED instead.
 */
static void inode_lru_list_add(struct inode *inode)
{
	if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
		this_cpu_inc(nr_unused);
	else
		inode->i_state |= I_REFERENCED;
}

/*
 * Add inode to LRU if needed (inode is unused and clean).
 *
 * Needs inode->i_lock held.
*/
void inode_add_lru(struct inode *inode)
{
	/*
	 * Only inodes with none of the dirty/sync/freeing state bits set,
	 * a zero reference count and a superblock that is still SB_ACTIVE
	 * go onto the LRU.
	 */
	if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
				I_FREEING | I_WILL_FREE)) &&
	    !atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE)
		inode_lru_list_add(inode);
}

/*
 * Take @inode off its superblock's inode LRU; nr_unused is only
 * decremented when list_lru_del() reports the inode was on the list.
 */
static void inode_lru_list_del(struct inode *inode)
{

	if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
		this_cpu_dec(nr_unused);
}

/**
 * inode_sb_list_add - add inode to the superblock list of inodes
 * @inode: inode to add
 *
 * Takes and releases inode->i_sb->s_inode_list_lock around the insertion.
 */
void inode_sb_list_add(struct inode *inode)
{
	spin_lock(&inode->i_sb->s_inode_list_lock);
	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
	spin_unlock(&inode->i_sb->s_inode_list_lock);
}
EXPORT_SYMBOL_GPL(inode_sb_list_add);

/*
 * Remove @inode from its superblock's inode list. The list_empty() test
 * is done without the lock; the lock is only taken when there is
 * something to unlink.
 */
static inline void inode_sb_list_del(struct inode *inode)
{
	if (!list_empty(&inode->i_sb_list)) {
		spin_lock(&inode->i_sb->s_inode_list_lock);
		list_del_init(&inode->i_sb_list);
		spin_unlock(&inode->i_sb->s_inode_list_lock);
	}
}

/*
 * Map (@sb, @hashval) to an index into inode_hashtable.
 *
 * Note on precedence: the division by L1_CACHE_BYTES applies only to
 * (GOLDEN_RATIO_PRIME + hashval); that quotient is then XORed with
 * hashval * (unsigned long)sb.
 */
static unsigned long hash(struct super_block *sb, unsigned long hashval)
{
	unsigned long tmp;

	tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
			L1_CACHE_BYTES;
	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
	return tmp & i_hash_mask;
}

/**
 * __insert_inode_hash - hash an inode
 * @inode: unhashed inode
 * @hashval: unsigned long value used to locate this object in the
 *		inode_hashtable.
 *
 * Add an inode to the inode hash for this superblock.
 */
void __insert_inode_hash(struct inode *inode, unsigned long hashval)
{
	struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);

	/* inode_hash_lock nests outside inode->i_lock */
	spin_lock(&inode_hash_lock);
	spin_lock(&inode->i_lock);
	hlist_add_head_rcu(&inode->i_hash, b);
	spin_unlock(&inode->i_lock);
	spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(__insert_inode_hash);

/**
 * __remove_inode_hash - remove an inode from the hash
 * @inode: inode to unhash
 *
 * Remove an inode from the superblock.
*/
void __remove_inode_hash(struct inode *inode)
{
	/* inode_hash_lock nests outside inode->i_lock */
	spin_lock(&inode_hash_lock);
	spin_lock(&inode->i_lock);
	hlist_del_init_rcu(&inode->i_hash);
	spin_unlock(&inode->i_lock);
	spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(__remove_inode_hash);

/*
 * Final state transition for an inode being freed: assert that it has no
 * pages, exceptional entries, private buffers or writeback-list
 * membership left, then set i_state to exactly I_FREEING | I_CLEAR.
 */
void clear_inode(struct inode *inode)
{
	/*
	 * We have to cycle the i_pages lock here because reclaim can be in the
	 * process of removing the last page (in __delete_from_page_cache())
	 * and we must not free the mapping under it.
	 */
	xa_lock_irq(&inode->i_data.i_pages);
	BUG_ON(inode->i_data.nrpages);
	BUG_ON(inode->i_data.nrexceptional);
	xa_unlock_irq(&inode->i_data.i_pages);
	BUG_ON(!list_empty(&inode->i_data.private_list));
	BUG_ON(!(inode->i_state & I_FREEING));
	BUG_ON(inode->i_state & I_CLEAR);
	BUG_ON(!list_empty(&inode->i_wb_list));
	/* don't need i_lock here, no concurrent mods to i_state */
	inode->i_state = I_FREEING | I_CLEAR;
}
EXPORT_SYMBOL(clear_inode);

/*
 * Free the inode passed in, removing it from the lists it is still connected
 * to. We remove any pages still attached to the inode and wait for any IO that
 * is still in progress before finally destroying the inode.
 *
 * An inode must already be marked I_FREEING so that we avoid the inode being
 * moved back onto lists if we race with other code that manipulates the lists
 * (e.g. writeback_single_inode). The caller is responsible for setting this.
 *
 * An inode must already be removed from the LRU list before being evicted from
 * the cache. This should occur atomically with setting the I_FREEING state
 * flag, so no inodes here should ever be on the LRU when being evicted.
 */
static void evict(struct inode *inode)
{
	const struct super_operations *op = inode->i_sb->s_op;

	BUG_ON(!(inode->i_state & I_FREEING));
	BUG_ON(!list_empty(&inode->i_lru));

	if (!list_empty(&inode->i_io_list))
		inode_io_list_del(inode);

	inode_sb_list_del(inode);

	/*
	 * Wait for flusher thread to be done with the inode so that filesystem
	 * does not start destroying it while writeback is still running. Since
	 * the inode has I_FREEING set, flusher thread won't start new work on
	 * the inode.  We just have to wait for running writeback to finish.
	 */
	inode_wait_for_writeback(inode);

	if (op->evict_inode) {
		op->evict_inode(inode);
	} else {
		/* default eviction: drop the page cache and clear the inode */
		truncate_inode_pages_final(&inode->i_data);
		clear_inode(inode);
	}
	if (S_ISBLK(inode->i_mode) && inode->i_bdev)
		bd_forget(inode);
	if (S_ISCHR(inode->i_mode) && inode->i_cdev)
		cd_forget(inode);

	remove_inode_hash(inode);

	spin_lock(&inode->i_lock);
	wake_up_bit(&inode->i_state, __I_NEW);
	/* whichever eviction path ran must have left exactly this state */
	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
	spin_unlock(&inode->i_lock);

	destroy_inode(inode);
}

/*
 * dispose_list - dispose of the contents of a local list
 * @head: the head of the list to free
 *
 * Dispose-list gets a local list with local inodes in it, so it doesn't
 * need to worry about list corruption and SMP locks.
 *
 * Inodes are linked on @head through i_lru; each one is unlinked,
 * evicted, and followed by a cond_resched().
 */
static void dispose_list(struct list_head *head)
{
	while (!list_empty(head)) {
		struct inode *inode;

		inode = list_first_entry(head, struct inode, i_lru);
		list_del_init(&inode->i_lru);

		evict(inode);
		cond_resched();
	}
}

/**
 * evict_inodes	- evict all evictable inodes for a superblock
 * @sb:		superblock to operate on
 *
 * Make sure that no inodes with zero refcount are retained.  This is
 * called by superblock shutdown after having SB_ACTIVE flag removed,
 * so any inode reaching zero refcount during or after that call will
 * be immediately evicted.
*/
void evict_inodes(struct super_block *sb)
{
	struct inode *inode, *next;
	LIST_HEAD(dispose);

again:
	spin_lock(&sb->s_inode_list_lock);
	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
		/* unlocked peek; referenced inodes are skipped */
		if (atomic_read(&inode->i_count))
			continue;

		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
			spin_unlock(&inode->i_lock);
			continue;
		}

		/*
		 * Mark the inode as being freed and take it off the LRU under
		 * i_lock, then collect it on the local dispose list (linked
		 * through i_lru).
		 */
		inode->i_state |= I_FREEING;
		inode_lru_list_del(inode);
		spin_unlock(&inode->i_lock);
		list_add(&inode->i_lru, &dispose);

		/*
		 * We can have a ton of inodes to evict at unmount time given
		 * enough memory, check to see if we need to go to sleep for a
		 * bit so we don't livelock.
		 */
		if (need_resched()) {
			spin_unlock(&sb->s_inode_list_lock);
			cond_resched();
			dispose_list(&dispose);
			/* the list lock was dropped, so restart the walk */
			goto again;
		}
	}
	spin_unlock(&sb->s_inode_list_lock);

	dispose_list(&dispose);
}
EXPORT_SYMBOL_GPL(evict_inodes);

/**
 * invalidate_inodes	- attempt to free all inodes on a superblock
 * @sb:		superblock to operate on
 * @kill_dirty: flag to guide handling of dirty inodes
 *
 * Attempts to free all inodes for a given superblock.  If there were any
 * busy inodes return a non-zero value, else zero.
 * If @kill_dirty is set, discard dirty inodes too, otherwise treat
 * them as busy.
*/
int invalidate_inodes(struct super_block *sb, bool kill_dirty)
{
	int busy = 0;
	struct inode *inode, *next;
	LIST_HEAD(dispose);

again:
	spin_lock(&sb->s_inode_list_lock);
	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
			spin_unlock(&inode->i_lock);
			busy = 1;
			continue;
		}
		if (atomic_read(&inode->i_count)) {
			spin_unlock(&inode->i_lock);
			busy = 1;
			continue;
		}

		/* collect the inode on the local dispose list via i_lru */
		inode->i_state |= I_FREEING;
		inode_lru_list_del(inode);
		spin_unlock(&inode->i_lock);
		list_add(&inode->i_lru, &dispose);
		if (need_resched()) {
			spin_unlock(&sb->s_inode_list_lock);
			cond_resched();
			dispose_list(&dispose);
			/* the list lock was dropped, so restart the walk */
			goto again;
		}
	}
	spin_unlock(&sb->s_inode_list_lock);

	dispose_list(&dispose);

	return busy;
}

/*
 * Isolate the inode from the LRU in preparation for freeing it.
 *
 * Any inodes which are pinned purely because of attached pagecache have their
 * pagecache removed.  If the inode has metadata buffers attached to
 * mapping->private_list then try to remove them.
 *
 * If the inode has the I_REFERENCED flag set, then it means that it has been
 * used recently - the flag is set in iput_final(). When we encounter such an
 * inode, clear the flag and move it to the back of the LRU so it gets another
 * pass through the LRU before it gets reclaimed. This is necessary because of
 * the fact we are doing lazy LRU updates to minimise lock contention so the
 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
 * with this flag set because they are the inodes that are out of order.
 */
static enum lru_status inode_lru_isolate(struct list_head *item,
		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
	struct list_head *freeable = arg;
	struct inode	*inode = container_of(item, struct inode, i_lru);

	/*
	 * we are inverting the lru lock/inode->i_lock here, so use a trylock.
	 * If we fail to get the lock, just skip it.
	 */
	if (!spin_trylock(&inode->i_lock))
		return LRU_SKIP;

	/*
	 * Inodes with a non-zero reference count, or with any i_state bit
	 * other than I_REFERENCED set, are still in use. We cannot reclaim
	 * them now, so take them off the LRU (and out of nr_unused).
	 */
	if (atomic_read(&inode->i_count) ||
	    (inode->i_state & ~I_REFERENCED)) {
		list_lru_isolate(lru, &inode->i_lru);
		spin_unlock(&inode->i_lock);
		this_cpu_dec(nr_unused);
		return LRU_REMOVED;
	}

	/* recently referenced inodes get one more pass */
	if (inode->i_state & I_REFERENCED) {
		inode->i_state &= ~I_REFERENCED;
		spin_unlock(&inode->i_lock);
		return LRU_ROTATE;
	}

	if (inode_has_buffers(inode) || inode->i_data.nrpages) {
		/*
		 * Pin the inode, then drop both locks before the buffers and
		 * page cache are stripped below. The walker is told to retry
		 * because the LRU lock was released.
		 */
		__iget(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(lru_lock);
		if (remove_inode_buffers(inode)) {
			unsigned long reap;
			reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
			if (current_is_kswapd())
				__count_vm_events(KSWAPD_INODESTEAL, reap);
			else
				__count_vm_events(PGINODESTEAL, reap);
			if (current->reclaim_state)
				current->reclaim_state->reclaimed_slab += reap;
		}
		iput(inode);
		spin_lock(lru_lock);
		return LRU_RETRY;
	}

	/* reclaimable: mark it I_FREEING and move it to the caller's list */
	WARN_ON(inode->i_state & I_NEW);
	inode->i_state |= I_FREEING;
	list_lru_isolate_move(lru, &inode->i_lru, freeable);
	spin_unlock(&inode->i_lock);

	this_cpu_dec(nr_unused);
	return LRU_REMOVED;
}

/*
 * Walk the superblock inode LRU for freeable inodes and attempt to free them.
 * This is called from the superblock shrinker function with a number of inodes
 * to trim from the LRU. Inodes to be freed are moved to a temporary list and
 * then are freed outside inode_lock by dispose_list().
 *
 * Returns the value reported by list_lru_shrink_walk().
 */
long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
{
	LIST_HEAD(freeable);
	long freed;

	freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
				     inode_lru_isolate, &freeable);
	dispose_list(&freeable);
	return freed;
}

/* Forward declaration; the definition is not part of this section. */
static void __wait_on_freeing_inode(struct inode *inode);
/*
 * Called with the inode lock held.
*/
static struct inode *find_inode(struct super_block *sb,
				struct hlist_head *head,
				int (*test)(struct inode *, void *),
				void *data)
{
	struct inode *inode = NULL;

	/*
	 * Returns the first inode on @head that belongs to @sb and that
	 * @test accepts, with an extra reference taken; ERR_PTR(-ESTALE)
	 * if that inode is still I_CREATING; NULL if nothing matched.
	 */
repeat:
	hlist_for_each_entry(inode, head, i_hash) {
		if (inode->i_sb != sb)
			continue;
		if (!test(inode, data))
			continue;
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
			/*
			 * NOTE(review): i_lock is not released here, so
			 * __wait_on_freeing_inode() presumably drops it -
			 * confirm against its definition. The bucket is
			 * rescanned from the start afterwards.
			 */
			__wait_on_freeing_inode(inode);
			goto repeat;
		}
		if (unlikely(inode->i_state & I_CREATING)) {
			spin_unlock(&inode->i_lock);
			return ERR_PTR(-ESTALE);
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		return inode;
	}
	return NULL;
}

/*
 * find_inode_fast is the fast path version of find_inode, see the comment at
 * iget_locked for details.
 *
 * It matches on i_ino and i_sb directly instead of calling a test
 * callback; the return values are the same as for find_inode().
 */
static struct inode *find_inode_fast(struct super_block *sb,
				struct hlist_head *head, unsigned long ino)
{
	struct inode *inode = NULL;

repeat:
	hlist_for_each_entry(inode, head, i_hash) {
		if (inode->i_ino != ino)
			continue;
		if (inode->i_sb != sb)
			continue;
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
			/* wait for the dying inode, then rescan the bucket */
			__wait_on_freeing_inode(inode);
			goto repeat;
		}
		if (unlikely(inode->i_state & I_CREATING)) {
			spin_unlock(&inode->i_lock);
			return ERR_PTR(-ESTALE);
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		return inode;
	}
	return NULL;
}

/*
 * Each cpu owns a range of LAST_INO_BATCH numbers.
 * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
 * to renew the exhausted range.
 *
 * This does not significantly increase overflow rate because every CPU can
 * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
 * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
 * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
 * overflow rate by 2x, which does not seem too significant.
 *
 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
 * error if st_ino won't fit in target struct field. Use 32bit counter
 * here to attempt to avoid that.
*/
#define LAST_INO_BATCH 1024
static DEFINE_PER_CPU(unsigned int, last_ino);

/*
 * Hand out the next inode number from this CPU's batch. The result is
 * never 0.
 */
unsigned int get_next_ino(void)
{
	unsigned int *p = &get_cpu_var(last_ino);
	unsigned int res = *p;

#ifdef CONFIG_SMP
	/*
	 * The per-cpu value sits on a batch boundary: reserve a fresh
	 * batch of LAST_INO_BATCH numbers from the shared counter.
	 */
	if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
		static atomic_t shared_last_ino;
		int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);

		res = next - LAST_INO_BATCH;
	}
#endif

	res++;
	/* get_next_ino should not provide a 0 inode number */
	if (unlikely(!res))
		res++;
	*p = res;
	put_cpu_var(last_ino);
	return res;
}
EXPORT_SYMBOL(get_next_ino);

/**
 * new_inode_pseudo - obtain an inode
 * @sb: superblock
 *
 * Allocates a new inode for given superblock.
 * Inode won't be chained in superblock s_inodes list
 * This means :
 * - fs can't be unmounted
 * - quotas, fsnotify, writeback can't work
 *
 * Returns the new inode with i_state cleared and an empty i_sb_list,
 * or NULL if alloc_inode() failed.
 */
struct inode *new_inode_pseudo(struct super_block *sb)
{
	struct inode *inode = alloc_inode(sb);

	if (inode) {
		spin_lock(&inode->i_lock);
		inode->i_state = 0;
		spin_unlock(&inode->i_lock);
		INIT_LIST_HEAD(&inode->i_sb_list);
	}
	return inode;
}

/**
 * new_inode - obtain an inode
 * @sb: superblock
 *
 * Allocates a new inode for given superblock. The default gfp_mask
 * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
* If HIGHMEM pages are unsuitable or it is known that pages allocated * for the page cache are not reclaimable or migratable, * mapping_set_gfp_mask() must be called with suitable flags on the * newly created inode's mapping * */ struct inode *new_inode(struct super_block *sb) { struct inode *inode; spin_lock_prefetch(&sb->s_inode_list_lock); inode = new_inode_pseudo(sb); if (inode) inode_sb_list_add(inode); return inode; } EXPORT_SYMBOL(new_inode); #ifdef CONFIG_DEBUG_LOCK_ALLOC void lockdep_annotate_inode_mutex_key(struct inode *inode) { if (S_ISDIR(inode->i_mode)) { struct file_system_type *type = inode->i_sb->s_type; /* Set new key only if filesystem hasn't already changed it */ if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) { /* * ensure nobody is actually holding i_mutex */ // mutex_destroy(&inode->i_mutex); init_rwsem(&inode->i_rwsem); lockdep_set_class(&inode->i_rwsem, &type->i_mutex_dir_key); } } } EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key); #endif /** * unlock_new_inode - clear the I_NEW state and wake up any waiters * @inode: new inode to unlock * * Called when the inode is fully initialised to clear the new state of the * inode and wake up anyone waiting for the inode to finish initialisation. */ void unlock_new_inode(struct inode *inode) { lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW & ~I_CREATING; smp_mb(); wake_up_bit(&inode->i_state, __I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(unlock_new_inode); void discard_new_inode(struct inode *inode) { lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW; smp_mb(); wake_up_bit(&inode->i_state, __I_NEW); spin_unlock(&inode->i_lock); iput(inode); } EXPORT_SYMBOL(discard_new_inode); /** * lock_two_nondirectories - take two i_mutexes on non-directory objects * * Lock any non-NULL argument that is not a directory. 
* Zero, one or two objects may be locked by this function.
 *
 * @inode1: first inode to lock
 * @inode2: second inode to lock
 */
void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
	/* always lock in ascending address order */
	if (inode1 > inode2)
		swap(inode1, inode2);

	if (inode1 && !S_ISDIR(inode1->i_mode))
		inode_lock(inode1);
	/* the same inode passed twice is locked only once */
	if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
		inode_lock_nested(inode2, I_MUTEX_NONDIR2);
}
EXPORT_SYMBOL(lock_two_nondirectories);

/**
 * unlock_two_nondirectories - release locks from lock_two_nondirectories()
 * @inode1: first inode to unlock
 * @inode2: second inode to unlock
 *
 * Applies the same non-NULL / non-directory / not-the-same-inode tests
 * as lock_two_nondirectories(), so it releases exactly what that took.
 */
void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
	if (inode1 && !S_ISDIR(inode1->i_mode))
		inode_unlock(inode1);
	if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
		inode_unlock(inode2);
}
EXPORT_SYMBOL(unlock_two_nondirectories);

/**
 * inode_insert5 - obtain an inode from a mounted file system
 * @inode:	pre-allocated inode to use for insert to cache
 * @hashval:	hash value (usually inode number) to get
 * @test:	callback used for comparisons between inodes
 * @set:	callback used to initialize a new struct inode
 * @data:	opaque data pointer to pass to @test and @set
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if present it is returned with an increased reference count. This is
 * a variant of iget5_locked() for callers that don't want to fail on memory
 * allocation of inode.
 *
 * If the inode is not in cache, insert the pre-allocated inode to cache and
 * return it locked, hashed, and with the I_NEW flag set. The file system gets
 * to fill it in before unlocking it via unlock_new_inode().
 *
 * Note both @test and @set are called with the inode_hash_lock held, so can't
 * sleep.
*/
struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
			    int (*test)(struct inode *, void *),
			    int (*set)(struct inode *, void *),
			    void *data)
{
	struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
	struct inode *old;
	/* sampled once up front; decides below whether to add to the sb list */
	bool creating = inode->i_state & I_CREATING;

again:
	spin_lock(&inode_hash_lock);
	old = find_inode(inode->i_sb, head, test, data);
	if (unlikely(old)) {
		/*
		 * Uhhuh, somebody else created the same inode under us.
		 * Use the old inode instead of the preallocated one.
		 */
		spin_unlock(&inode_hash_lock);
		/* find_inode() returned an error pointer: report failure */
		if (IS_ERR(old))
			return NULL;
		wait_on_inode(old);
		/* it got unhashed while we waited: drop it and search again */
		if (unlikely(inode_unhashed(old))) {
			iput(old);
			goto again;
		}
		return old;
	}

	/* a non-zero return from @set aborts the insert; NULL is returned */
	if (set && unlikely(set(inode, data))) {
		inode = NULL;
		goto unlock;
	}

	/*
	 * Return the locked inode with I_NEW set, the
	 * caller is responsible for filling in the contents
	 */
	spin_lock(&inode->i_lock);
	inode->i_state |= I_NEW;
	hlist_add_head_rcu(&inode->i_hash, head);
	spin_unlock(&inode->i_lock);
	if (!creating)
		inode_sb_list_add(inode);
unlock:
	spin_unlock(&inode_hash_lock);

	return inode;
}
EXPORT_SYMBOL(inode_insert5);

/**
 * iget5_locked - obtain an inode from a mounted file system
 * @sb:		super block of file system
 * @hashval:	hash value (usually inode number) to get
 * @test:	callback used for comparisons between inodes
 * @set:	callback used to initialize a new struct inode
 * @data:	opaque data pointer to pass to @test and @set
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if present it is returned with an increased reference count. This is
 * a generalized version of iget_locked() for file systems where the inode
 * number is not sufficient for unique identification of an inode.
 *
 * If the inode is not in cache, allocate a new inode and return it locked,
 * hashed, and with the I_NEW flag set. The file system gets to fill it in
 * before unlocking it via unlock_new_inode().
 *
 * Note both @test and @set are called with the inode_hash_lock held, so can't
 * sleep.
*/
struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
		int (*test)(struct inode *, void *),
		int (*set)(struct inode *, void *), void *data)
{
	/* Try the cache first; only allocate when nothing is found. */
	struct inode *inode = ilookup5(sb, hashval, test, data);

	if (!inode) {
		struct inode *new = alloc_inode(sb);

		if (new) {
			new->i_state = 0;
			inode = inode_insert5(new, hashval, test, set, data);
			/*
			 * inode_insert5() handed back something other than
			 * our fresh inode (an existing inode or NULL), so the
			 * preallocated one is unused and must be destroyed.
			 */
			if (unlikely(inode != new))
				destroy_inode(new);
		}
	}
	return inode;
}
EXPORT_SYMBOL(iget5_locked);

/**
 * iget_locked - obtain an inode from a mounted file system
 * @sb: super block of file system
 * @ino: inode number to get
 *
 * Search for the inode specified by @ino in the inode cache and if present
 * return it with an increased reference count. This is for file systems
 * where the inode number is sufficient for unique identification of an inode.
 *
 * If the inode is not in cache, allocate a new inode and return it locked,
 * hashed, and with the I_NEW flag set. The file system gets to fill it in
 * before unlocking it via unlock_new_inode().
 *
 * Returns NULL if the lookup yields an error pointer or the allocation fails.
 */
struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
	struct hlist_head *head = inode_hashtable + hash(sb, ino);
	struct inode *inode;
again:
	spin_lock(&inode_hash_lock);
	inode = find_inode_fast(sb, head, ino);
	spin_unlock(&inode_hash_lock);
	if (inode) {
		if (IS_ERR(inode))
			return NULL;
		wait_on_inode(inode);
		/* Unhashed while we waited: drop it and look again. */
		if (unlikely(inode_unhashed(inode))) {
			iput(inode);
			goto again;
		}
		return inode;
	}

	inode = alloc_inode(sb);
	if (inode) {
		struct inode *old;

		spin_lock(&inode_hash_lock);
		/* We released the lock, so.. */
		old = find_inode_fast(sb, head, ino);
		if (!old) {
			inode->i_ino = ino;
			spin_lock(&inode->i_lock);
			inode->i_state = I_NEW;
			hlist_add_head_rcu(&inode->i_hash, head);
			spin_unlock(&inode->i_lock);
			inode_sb_list_add(inode);
			spin_unlock(&inode_hash_lock);

			/* Return the locked inode with I_NEW set, the
			 * caller is responsible for filling in the contents
			 */
			return inode;
		}

		/*
		 * Uhhuh, somebody else created the same inode under
		 * us. Use the old inode instead of the one we just
		 * allocated.
		 */
		spin_unlock(&inode_hash_lock);
		destroy_inode(inode);
		if (IS_ERR(old))
			return NULL;
		inode = old;
		wait_on_inode(inode);
		if (unlikely(inode_unhashed(inode))) {
			iput(inode);
			goto again;
		}
	}
	return inode;
}
EXPORT_SYMBOL(iget_locked);

/*
 * search the inode cache for a matching inode number.
 * If we find one, then the inode number we are trying to
 * allocate is not unique and so we should not use it.
 *
 * The hash chain is walked with the RCU list iterator and no lock is taken
 * here; iunique() below calls this inside rcu_read_lock().
 *
 * Returns 1 if the inode number is unique, 0 if it is not.
 */
static int test_inode_iunique(struct super_block *sb, unsigned long ino)
{
	struct hlist_head *b = inode_hashtable + hash(sb, ino);
	struct inode *inode;

	hlist_for_each_entry_rcu(inode, b, i_hash) {
		if (inode->i_ino == ino && inode->i_sb == sb)
			return 0;
	}
	return 1;
}

/**
 * iunique - get a unique inode number
 * @sb: superblock
 * @max_reserved: highest reserved inode number
 *
 * Obtain an inode number that is unique on the system for a given
 * superblock. This is used by file systems that have no natural
 * permanent inode numbering system. An inode number is returned that
 * is higher than the reserved limit but unique.
 *
 * BUGS:
 * With a large number of inodes live on the file system this function
 * currently becomes quite slow.
 */
ino_t iunique(struct super_block *sb, ino_t max_reserved)
{
	/*
	 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
	 * error if st_ino won't fit in target struct field. Use 32bit counter
	 * here to attempt to avoid that.
	 */
	static DEFINE_SPINLOCK(iunique_lock);
	/* Shared by all superblocks; serialized by iunique_lock. */
	static unsigned int counter;
	ino_t res;

	rcu_read_lock();
	spin_lock(&iunique_lock);
	do {
		/* Skip past the reserved range (also after wrap-around). */
		if (counter <= max_reserved)
			counter = max_reserved + 1;
		res = counter++;
	} while (!test_inode_iunique(sb, res));
	spin_unlock(&iunique_lock);
	rcu_read_unlock();

	return res;
}
EXPORT_SYMBOL(iunique);

/*
 * igrab - grab a reference on an inode unless it is being freed
 *
 * Under i_lock, calls __iget() on @inode when neither I_FREEING nor
 * I_WILL_FREE is set and returns @inode; otherwise returns NULL.
 */
struct inode *igrab(struct inode *inode)
{
	spin_lock(&inode->i_lock);
	if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
		__iget(inode);
		spin_unlock(&inode->i_lock);
	} else {
		spin_unlock(&inode->i_lock);
		/*
		 * Handle the case where s_op->clear_inode has not been
		 * called yet, and somebody is calling igrab
		 * while the inode is getting freed.
		 */
		inode = NULL;
	}
	return inode;
}
EXPORT_SYMBOL(igrab);

/**
 * ilookup5_nowait - search for an inode in the inode cache
 * @sb: super block of file system to search
 * @hashval: hash value (usually inode number) to search for
 * @test: callback used for comparisons between inodes
 * @data: opaque data pointer to pass to @test
 *
 * Search for the inode specified by @hashval and @data in the inode cache.
 * If the inode is in the cache, the inode is returned with an incremented
 * reference count.
 *
 * Note: I_NEW is not waited upon so you have to be very careful what you do
 * with the returned inode. You probably should be using ilookup5() instead.
 *
 * Note2: @test is called with the inode_hash_lock held, so can't sleep.
 */
struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
		int (*test)(struct inode *, void *), void *data)
{
	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
	struct inode *inode;

	spin_lock(&inode_hash_lock);
	inode = find_inode(sb, head, test, data);
	spin_unlock(&inode_hash_lock);

	/* Callers only ever see NULL or a valid inode, never an error pointer. */
	return IS_ERR(inode) ? NULL : inode;
}
EXPORT_SYMBOL(ilookup5_nowait);

/**
 * ilookup5 - search for an inode in the inode cache
 * @sb: super block of file system to search
 * @hashval: hash value (usually inode number) to search for
 * @test: callback used for comparisons between inodes
 * @data: opaque data pointer to pass to @test
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if the inode is in the cache, return the inode with an incremented
 * reference count. Waits on I_NEW before returning the inode.
 *
 * This is a generalized version of ilookup() for file systems where the
 * inode number is not sufficient for unique identification of an inode.
 *
 * Note: @test is called with the inode_hash_lock held, so can't sleep.
 */
struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
		int (*test)(struct inode *, void *), void *data)
{
	struct inode *inode;
again:
	inode = ilookup5_nowait(sb, hashval, test, data);
	if (inode) {
		wait_on_inode(inode);
		/* Unhashed while we waited: drop it and repeat the lookup. */
		if (unlikely(inode_unhashed(inode))) {
			iput(inode);
			goto again;
		}
	}
	return inode;
}
EXPORT_SYMBOL(ilookup5);

/**
 * ilookup - search for an inode in the inode cache
 * @sb: super block of file system to search
 * @ino: inode number to search for
 *
 * Search for the inode @ino in the inode cache, and if the inode is in the
 * cache, the inode is returned with an incremented reference count.
*/
struct inode *ilookup(struct super_block *sb, unsigned long ino)
{
	struct hlist_head *head = inode_hashtable + hash(sb, ino);
	struct inode *inode;
again:
	spin_lock(&inode_hash_lock);
	inode = find_inode_fast(sb, head, ino);
	spin_unlock(&inode_hash_lock);

	if (inode) {
		/* An error pointer from the lookup is reported as NULL. */
		if (IS_ERR(inode))
			return NULL;
		wait_on_inode(inode);
		/* Unhashed while we waited: drop it and repeat the lookup. */
		if (unlikely(inode_unhashed(inode))) {
			iput(inode);
			goto again;
		}
	}
	return inode;
}
EXPORT_SYMBOL(ilookup);

/**
 * find_inode_nowait - find an inode in the inode cache
 * @sb: super block of file system to search
 * @hashval: hash value (usually inode number) to search for
 * @match: callback used for comparisons between inodes
 * @data: opaque data pointer to pass to @match
 *
 * Search for the inode specified by @hashval and @data in the inode
 * cache, where the helper function @match will return 0 if the inode
 * does not match, 1 if the inode does match, and -1 if the search
 * should be stopped. The @match function must be responsible for
 * taking the i_lock spin_lock and checking i_state for an inode being
 * freed or being initialized, and incrementing the reference count
 * before returning 1. It also must not sleep, since it is called with
 * the inode_hash_lock spinlock held.
 *
 * This is an even more generalized version of ilookup5() when the
 * function must never block --- find_inode() can block in
 * __wait_on_freeing_inode() --- or when the caller can not increment
 * the reference count because the resulting iput() might cause an
 * inode eviction. The tradeoff is that the @match function must be
 * very carefully implemented.
 */
struct inode *find_inode_nowait(struct super_block *sb,
				unsigned long hashval,
				int (*match)(struct inode *, unsigned long,
					     void *),
				void *data)
{
	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
	struct inode *inode, *ret_inode = NULL;
	int mval;

	spin_lock(&inode_hash_lock);
	hlist_for_each_entry(inode, head, i_hash) {
		if (inode->i_sb != sb)
			continue;
		mval = match(inode, hashval, data);
		if (mval == 0)
			continue;
		/* Any non-zero result ends the walk; only 1 yields an inode. */
		if (mval == 1)
			ret_inode = inode;
		goto out;
	}
out:
	spin_unlock(&inode_hash_lock);
	return ret_inode;
}
EXPORT_SYMBOL(find_inode_nowait);

/**
 * find_inode_rcu - find an inode in the inode cache
 * @sb: Super block of file system to search
 * @hashval: Key to hash
 * @test: Function to test match on an inode
 * @data: Data for test function
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * where the helper function @test will return 0 if the inode does not match
 * and 1 if it does. The @test function must be responsible for taking the
 * i_lock spin_lock and checking i_state for an inode being freed or being
 * initialized.
 *
 * If successful, this will return the inode for which the @test function
 * returned 1 and NULL otherwise.
 *
 * The @test function is not permitted to take a ref on any inode presented.
 * It is also not permitted to sleep.
 *
 * The caller must hold the RCU read lock.
*/
struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval,
			     int (*test)(struct inode *, void *), void *data)
{
	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
	struct inode *inode;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
			 "suspicious find_inode_rcu() usage");

	hlist_for_each_entry_rcu(inode, head, i_hash) {
		/* i_state is sampled without i_lock, hence READ_ONCE(). */
		if (inode->i_sb == sb &&
		    !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) &&
		    test(inode, data))
			return inode;
	}
	return NULL;
}
EXPORT_SYMBOL(find_inode_rcu);

/**
 * find_inode_by_ino_rcu - Find an inode in the inode cache
 * @sb: Super block of file system to search
 * @ino: The inode number to match
 *
 * Search the inode cache for an inode of @sb whose i_ino equals @ino,
 * skipping any inode whose i_state has I_FREEING or I_WILL_FREE set.
 *
 * If successful, this will return the matching inode and NULL otherwise.
 * No reference is taken on the returned inode and no lock is acquired here.
 *
 * The caller must hold the RCU read lock.
 */
struct inode *find_inode_by_ino_rcu(struct super_block *sb,
				    unsigned long ino)
{
	struct hlist_head *head = inode_hashtable + hash(sb, ino);
	struct inode *inode;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
			 "suspicious find_inode_by_ino_rcu() usage");

	hlist_for_each_entry_rcu(inode, head, i_hash) {
		/* i_state is sampled without i_lock, hence READ_ONCE(). */
		if (inode->i_ino == ino &&
		    inode->i_sb == sb &&
		    !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)))
		    return inode;
	}
	return NULL;
}
EXPORT_SYMBOL(find_inode_by_ino_rcu);

/*
 * insert_inode_locked - hash a new inode under its own inode number
 *
 * Adds @inode to the hash chain for (i_sb, i_ino) with I_NEW | I_CREATING
 * set, provided no live inode with the same number is hashed there.
 *
 * Returns 0 on success. Returns -EBUSY when a conflicting inode is itself
 * marked I_CREATING, or is still hashed after we waited on it. If the
 * conflicting inode turned out to be unhashed after the wait, the search is
 * repeated.
 */
int insert_inode_locked(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	ino_t ino = inode->i_ino;
	struct hlist_head *head = inode_hashtable + hash(sb, ino);

	while (1) {
		struct inode *old = NULL;
		spin_lock(&inode_hash_lock);
		hlist_for_each_entry(old, head, i_hash) {
			if (old->i_ino != ino)
				continue;
			if (old->i_sb != sb)
				continue;
			spin_lock(&old->i_lock);
			/* Inodes on their way out do not count as conflicts. */
			if (old->i_state & (I_FREEING|I_WILL_FREE)) {
				spin_unlock(&old->i_lock);
				continue;
			}
			/* Leave the loop with old->i_lock still held. */
			break;
		}
		if (likely(!old)) {
			spin_lock(&inode->i_lock);
			inode->i_state |= I_NEW | I_CREATING;
			hlist_add_head_rcu(&inode->i_hash, head);
			spin_unlock(&inode->i_lock);
			spin_unlock(&inode_hash_lock);
			return 0;
		}
		if (unlikely(old->i_state & I_CREATING)) {
			spin_unlock(&old->i_lock);
			spin_unlock(&inode_hash_lock);
			return -EBUSY;
		}
		/* Pin the conflicting inode so it can be waited on unlocked. */
		__iget(old);
		spin_unlock(&old->i_lock);
		spin_unlock(&inode_hash_lock);
		wait_on_inode(old);
		if (unlikely(!inode_unhashed(old))) {
			iput(old);
			return -EBUSY;
		}
		iput(old);
	}
}
EXPORT_SYMBOL(insert_inode_locked);

/*
 * insert_inode_locked4 - insert_inode_locked() variant using a @test callback
 *
 * Marks @inode I_CREATING and inserts it via inode_insert5() with no @set
 * callback. If inode_insert5() returns anything other than @inode, that
 * result is released with iput() and -EBUSY is returned; otherwise 0.
 */
int insert_inode_locked4(struct inode *inode, unsigned long hashval,
		int (*test)(struct inode *, void *), void *data)
{
	struct inode *old;

	inode->i_state |= I_CREATING;
	old = inode_insert5(inode, hashval, test, NULL, data);

	if (old != inode) {
		iput(old);
		return -EBUSY;
	}
	return 0;
}
EXPORT_SYMBOL(insert_inode_locked4);

/*
 * Unconditionally returns 1.
 * NOTE(review): presumably meant as a ->drop_inode implementation that always
 * asks for eviction -- confirm against its users.
 */
int generic_delete_inode(struct inode *inode)
{
	return 1;
}
EXPORT_SYMBOL(generic_delete_inode);

/*
 * Called when we're dropping the last reference
 * to an inode.
*
 * Call the FS "drop_inode()" function, defaulting to
 * the legacy UNIX filesystem behaviour. If it tells
 * us to evict inode, do so. Otherwise, retain inode
 * in cache if fs is alive, sync and evict if fs is
 * shutting down.
 */
static void iput_final(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	const struct super_operations *op = inode->i_sb->s_op;
	unsigned long state;
	int drop;

	WARN_ON(inode->i_state & I_NEW);

	if (op->drop_inode)
		drop = op->drop_inode(inode);
	else
		drop = generic_drop_inode(inode);

	/* Keep the inode cached: put it on the LRU and release i_lock. */
	if (!drop &&
	    !(inode->i_state & I_DONTCACHE) &&
	    (sb->s_flags & SB_ACTIVE)) {
		inode_add_lru(inode);
		spin_unlock(&inode->i_lock);
		return;
	}

	state = inode->i_state;
	if (!drop) {
		/*
		 * Flag the inode I_WILL_FREE while i_lock is dropped for the
		 * synchronous writeout, then re-read the state under the lock.
		 */
		WRITE_ONCE(inode->i_state, state | I_WILL_FREE);
		spin_unlock(&inode->i_lock);

		write_inode_now(inode, 1);

		spin_lock(&inode->i_lock);
		state = inode->i_state;
		WARN_ON(state & I_NEW);
		state &= ~I_WILL_FREE;
	}

	WRITE_ONCE(inode->i_state, state | I_FREEING);
	if (!list_empty(&inode->i_lru))
		inode_lru_list_del(inode);
	spin_unlock(&inode->i_lock);

	evict(inode);
}

/**
 * iput - put an inode
 * @inode: inode to put
 *
 * Puts an inode, dropping its usage count. If the inode use count hits
 * zero, the inode is then freed and may also be destroyed.
 *
 * A NULL @inode is accepted and ignored.
 *
 * Consequently, iput() can sleep.
 */
void iput(struct inode *inode)
{
	if (!inode)
		return;
	BUG_ON(inode->i_state & I_CLEAR);
retry:
	/* Takes i_lock only when the count drops to zero. */
	if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
		/*
		 * Still linked and carrying I_DIRTY_TIME: take the reference
		 * back, mark the inode dirty, then retry the final put.
		 */
		if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
			atomic_inc(&inode->i_count);
			spin_unlock(&inode->i_lock);
			trace_writeback_lazytime_iput(inode);
			mark_inode_dirty_sync(inode);
			goto retry;
		}
		iput_final(inode);
	}
}
EXPORT_SYMBOL(iput);

#ifdef CONFIG_BLOCK
/**
 * bmap - find a block number in a file
 * @inode: inode owning the block number being requested
 * @block: pointer containing the block to find
 *
 * Replaces the value in ``*block`` with the block number on the device that
 * corresponds to the requested block number in the file.
 * That is, asked for block 4 of inode 1 the function will replace the
 * 4 in ``*block``, with disk block relative to the disk start that holds that
 * block of the file.
 *
 * Returns -EINVAL in case of error, 0 otherwise. If mapping falls into a
 * hole, returns 0 and ``*block`` is also set to 0.
 */
int bmap(struct inode *inode, sector_t *block)
{
	/* The only error reported here: the mapping has no ->bmap method. */
	if (!inode->i_mapping->a_ops->bmap)
		return -EINVAL;

	*block = inode->i_mapping->a_ops->bmap(inode->i_mapping, *block);
	return 0;
}
EXPORT_SYMBOL(bmap);
#endif

/*
 * With relative atime, only update atime if the previous atime is
 * earlier than either the ctime or mtime or if at least a day has
 * passed since the last atime update.
 *
 * Returns 1 when atime should be updated (always, if the mount does not
 * have MNT_RELATIME set) and 0 when the update can be skipped.
 */
static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
			     struct timespec64 now)
{

	if (!(mnt->mnt_flags & MNT_RELATIME))
		return 1;
	/*
	 * Is mtime younger than atime? If yes, update atime:
	 */
	if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0)
		return 1;
	/*
	 * Is ctime younger than atime? If yes, update atime:
	 */
	if (timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0)
		return 1;

	/*
	 * Is the previous atime value older than a day? If yes,
	 * update atime:
	 */
	if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60)
		return 1;
	/*
	 * Good, we can skip the atime update:
	 */
	return 0;
}

/*
 * generic_update_time - store new timestamps in the inode and mark it dirty
 *
 * Copies *@time into the atime/ctime/mtime fields selected by @flags
 * (S_ATIME, S_CTIME, S_MTIME) and, for S_VERSION, lets
 * inode_maybe_inc_iversion() decide whether the version changed. The inode is
 * always marked with I_DIRTY_TIME; I_DIRTY_SYNC is added when the version
 * was bumped, or when a timestamp was set and the superblock does not have
 * SB_LAZYTIME. Always returns 0.
 */
int generic_update_time(struct inode *inode, struct timespec64 *time, int flags)
{
	int iflags = I_DIRTY_TIME;
	bool dirty = false;

	if (flags & S_ATIME)
		inode->i_atime = *time;
	if (flags & S_VERSION)
		dirty = inode_maybe_inc_iversion(inode, false);
	if (flags & S_CTIME)
		inode->i_ctime = *time;
	if (flags & S_MTIME)
		inode->i_mtime = *time;
	if ((flags & (S_ATIME | S_CTIME | S_MTIME)) &&
	    !(inode->i_sb->s_flags & SB_LAZYTIME))
		dirty = true;

	if (dirty)
		iflags |= I_DIRTY_SYNC;
	__mark_inode_dirty(inode, iflags);
	return 0;
}
EXPORT_SYMBOL(generic_update_time);

/*
 * This does the actual work of updating an inodes time or version. Must have
 * had called mnt_want_write() before calling this.
*/
int inode_update_time(struct inode *inode, struct timespec64 *time, int flags)
{
	/* A filesystem-provided ->update_time takes precedence. */
	if (inode->i_op->update_time)
		return inode->i_op->update_time(inode, time, flags);
	return generic_update_time(inode, time, flags);
}
EXPORT_SYMBOL(inode_update_time);

/**
 * atime_needs_update - decide whether an access-time update is due
 * @path: the &struct path being accessed (its mount flags are consulted)
 * @inode: inode whose atime would be updated
 *
 * Returns false when the update is suppressed by an inode, superblock or
 * mount "noatime"/"nodiratime" setting, by unmapped owner ids, by the
 * relatime policy, or because the stored atime already equals the current
 * time. Returns true otherwise. Nothing is modified.
 */
bool atime_needs_update(const struct path *path, struct inode *inode)
{
	struct vfsmount *mnt = path->mnt;
	struct timespec64 now;

	if (inode->i_flags & S_NOATIME)
		return false;

	/* Atime updates will likely cause i_uid and i_gid to be written
	 * back improperly if their true value is unknown to the vfs.
	 */
	if (HAS_UNMAPPED_ID(inode))
		return false;

	if (IS_NOATIME(inode))
		return false;
	if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
		return false;

	if (mnt->mnt_flags & MNT_NOATIME)
		return false;
	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
		return false;

	now = current_time(inode);

	if (!relatime_need_update(mnt, inode, now))
		return false;

	if (timespec64_equal(&inode->i_atime, &now))
		return false;

	return true;
}

/**
 * touch_atime - update the access time
 * @path: the &struct path to update
 *
 * Update the accessed time on the inode behind @path when
 * atime_needs_update() says an update is due. The update is silently
 * skipped when write access to the superblock or the mount cannot be
 * obtained, and a failure of the update itself is ignored.
 */
void touch_atime(const struct path *path)
{
	struct vfsmount *mnt = path->mnt;
	struct inode *inode = d_inode(path->dentry);
	struct timespec64 now;

	if (!atime_needs_update(path, inode))
		return;

	/* Non-blocking attempt only: give up rather than wait. */
	if (!sb_start_write_trylock(inode->i_sb))
		return;

	if (__mnt_want_write(mnt) != 0)
		goto skip_update;
	/*
	 * File systems can error out when updating inodes if they need to
	 * allocate new space to modify an inode (such is the case for
	 * Btrfs), but since we touch atime while walking down the path we
	 * really don't care if we failed to update the atime of the file,
	 * so just ignore the return value.
	 * We may also fail on filesystems that have the ability to make parts
	 * of the fs read only, e.g. subvolumes in Btrfs.
	 */
	now = current_time(inode);
	inode_update_time(inode, &now, S_ATIME);
	__mnt_drop_write(mnt);
skip_update:
	sb_end_write(inode->i_sb);
}
EXPORT_SYMBOL(touch_atime);

/*
 * The logic we want is
 *
 *	if suid or (sgid and xgrp)
 *		remove privs
 *
 * Returns the ATTR_KILL_SUID/ATTR_KILL_SGID mask to apply, or 0 when nothing
 * needs removing, the caller has CAP_FSETID, or the inode is not a regular
 * file.
 */
int should_remove_suid(struct dentry *dentry)
{
	umode_t mode = d_inode(dentry)->i_mode;
	int kill = 0;

	/* suid always must be killed */
	if (unlikely(mode & S_ISUID))
		kill = ATTR_KILL_SUID;

	/*
	 * sgid without any exec bits is just a mandatory locking mark; leave
	 * it alone. If some exec bits are set, it's a real sgid; kill it.
	 */
	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
		kill |= ATTR_KILL_SGID;

	if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
		return kill;

	return 0;
}
EXPORT_SYMBOL(should_remove_suid);

/*
 * Return mask of changes for notify_change() that need to be done as a
 * response to write or truncate. Return 0 if nothing has to be changed.
 * Negative value on error (change should be denied).
 */
int dentry_needs_remove_privs(struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);
	int mask = 0;
	int ret;

	if (IS_NOSEC(inode))
		return 0;

	mask = should_remove_suid(dentry);
	ret = security_inode_need_killpriv(dentry);
	if (ret < 0)
		return ret;
	if (ret)
		mask |= ATTR_KILL_PRIV;
	return mask;
}

/*
 * Apply the @kill mask to @dentry through notify_change(), with ATTR_FORCE
 * set. Only ia_valid of the attribute structure is filled in here.
 */
static int __remove_privs(struct dentry *dentry, int kill)
{
	struct iattr newattrs;

	newattrs.ia_valid = ATTR_FORCE | kill;
	/*
	 * Note we call this on write, so notify_change will not
	 * encounter any conflicting delegations:
	 */
	return notify_change(dentry, &newattrs, NULL);
}

/*
 * Remove special file privileges (suid, capabilities) when file is written
 * to or truncated.
 *
 * Returns 0 on success or when there is nothing to do, otherwise the negative
 * error from dentry_needs_remove_privs() or __remove_privs().
 */
int file_remove_privs(struct file *file)
{
	struct dentry *dentry = file_dentry(file);
	struct inode *inode = file_inode(file);
	int kill;
	int error = 0;

	/*
	 * Fast path for nothing security related.
	 * As well for non-regular files, e.g. blkdev inodes.
	 * For example, blkdev_write_iter() might get here
	 * trying to remove privs which it is not allowed to.
	 */
	if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
		return 0;

	kill = dentry_needs_remove_privs(dentry);
	if (kill < 0)
		return kill;
	if (kill)
		error = __remove_privs(dentry, kill);
	if (!error)
		inode_has_no_xattr(inode);

	return error;
}
EXPORT_SYMBOL(file_remove_privs);

/**
 * file_update_time - update mtime and ctime time
 * @file: file accessed
 *
 * Update the mtime and ctime members of an inode and mark the inode
 * for writeback. Note that this function is meant exclusively for
 * usage in the file write path of filesystems, and filesystems may
 * choose to explicitly ignore update via this function with the
 * S_NOCMTIME inode flag, e.g. for network filesystem where these
 * timestamps are handled by the server. This can return an error for
 * file systems that need to allocate space in order to update an inode.
 */
int file_update_time(struct file *file)
{
	struct inode *inode = file_inode(file);
	struct timespec64 now;
	int sync_it = 0;
	int ret;

	/* First try to exhaust all avenues to not sync */
	if (IS_NOCMTIME(inode))
		return 0;

	now = current_time(inode);
	if (!timespec64_equal(&inode->i_mtime, &now))
		sync_it = S_MTIME;

	if (!timespec64_equal(&inode->i_ctime, &now))
		sync_it |= S_CTIME;

	if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
		sync_it |= S_VERSION;

	if (!sync_it)
		return 0;

	/* Finally allowed to write? Takes lock. */
	/* Failing to get write access is not reported as an error. */
	if (__mnt_want_write_file(file))
		return 0;

	ret = inode_update_time(inode, &now, sync_it);
	__mnt_drop_write_file(file);

	return ret;
}
EXPORT_SYMBOL(file_update_time);

/* Caller must hold the file's inode lock */
int file_modified(struct file *file)
{
	int err;

	/*
	 * Clear the security bits if the process is not being run by root.
	 * This keeps people from modifying setuid and setgid binaries.
	 */
	err = file_remove_privs(file);
	if (err)
		return err;

	/* Files opened with FMODE_NOCMTIME skip the timestamp update. */
	if (unlikely(file->f_mode & FMODE_NOCMTIME))
		return 0;

	return file_update_time(file);
}
EXPORT_SYMBOL(file_modified);

/*
 * Returns 1 if IS_SYNC() holds for @inode, or if it is a directory for which
 * IS_DIRSYNC() holds; 0 otherwise.
 */
int inode_needs_sync(struct inode *inode)
{
	if (IS_SYNC(inode))
		return 1;
	if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
		return 1;
	return 0;
}
EXPORT_SYMBOL(inode_needs_sync);

/*
 * If we try to find an inode in the inode hash while it is being
 * deleted, we have to wait until the filesystem completes its
 * deletion before reporting that it isn't found. This function waits
 * until the deletion _might_ have completed. Callers are responsible
 * to recheck inode state.
 *
 * It doesn't matter if I_NEW is not set initially, a call to
 * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
 * will DTRT.
 *
 * Drops both inode->i_lock and inode_hash_lock before sleeping; only
 * inode_hash_lock is re-taken before returning.
 */
static void __wait_on_freeing_inode(struct inode *inode)
{
	wait_queue_head_t *wq;
	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
	wq = bit_waitqueue(&inode->i_state, __I_NEW);
	/* Queue ourselves before dropping the locks. */
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&inode->i_lock);
	spin_unlock(&inode_hash_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
	spin_lock(&inode_hash_lock);
}

/* Value of the "ihash_entries=" boot parameter; passed to the hash allocator. */
static __initdata unsigned long ihash_entries;
static int __init set_ihash_entries(char *str)
{
	if (!str)
		return 0;
	/* Base 0: the number's prefix selects the radix. */
	ihash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("ihash_entries=", set_ihash_entries);

/*
 * Initialize the waitqueues and inode hash table.
 */
void __init inode_init_early(void)
{
	/* If hashes are distributed across NUMA nodes, defer
	 * hash allocation until vmalloc space is available.
	 */
	if (hashdist)
		return;

	inode_hashtable =
		alloc_large_system_hash("Inode-cache",
					sizeof(struct hlist_head),
					ihash_entries,
					14,
					HASH_EARLY | HASH_ZERO,
					&i_hash_shift,
					&i_hash_mask,
					0,
					0);
}

/*
 * Create the inode slab cache and, when inode_init_early() deferred it
 * (hashdist set), allocate the inode hash table.
 */
void __init inode_init(void)
{
	/* inode slab cache */
	inode_cachep = kmem_cache_create("inode_cache",
					 sizeof(struct inode),
					 0,
					 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
					 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
					 init_once);

	/* Hash may have been set up in inode_init_early */
	if (!hashdist)
		return;

	inode_hashtable =
		alloc_large_system_hash("Inode-cache",
					sizeof(struct hlist_head),
					ihash_entries,
					14,
					HASH_ZERO,
					&i_hash_shift,
					&i_hash_mask,
					0,
					0);
}

/*
 * init_special_inode - set up an inode for a device node, FIFO or socket
 *
 * Stores @mode in i_mode. For character and block devices, installs the
 * default file operations and records @rdev; for FIFOs installs
 * pipefifo_fops; sockets keep their existing i_fop. Any other mode only
 * produces a debug message.
 */
void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
	inode->i_mode = mode;
	if (S_ISCHR(mode)) {
		inode->i_fop = &def_chr_fops;
		inode->i_rdev = rdev;
	} else if (S_ISBLK(mode)) {
		inode->i_fop = &def_blk_fops;
		inode->i_rdev = rdev;
	} else if (S_ISFIFO(mode))
		inode->i_fop = &pipefifo_fops;
	else if (S_ISSOCK(mode))
		;	/* leave it no_open_fops */
	else
		printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
				  " inode %s:%lu\n", mode, inode->i_sb->s_id,
				  inode->i_ino);
}
EXPORT_SYMBOL(init_special_inode);

/**
 * inode_init_owner - Init uid,gid,mode for new inode according to posix standards
 * @inode: New inode
 * @dir: Directory inode
 * @mode: mode of the new inode
 *
 * The uid is always the caller's fsuid. When @dir is set-group-ID the gid is
 * inherited from @dir, otherwise it is the caller's fsgid. @dir may be NULL.
 */
void inode_init_owner(struct inode *inode, const struct inode *dir,
			umode_t mode)
{
	inode->i_uid = current_fsuid();
	if (dir && dir->i_mode & S_ISGID) {
		inode->i_gid = dir->i_gid;

		/* Directories are special, and always inherit S_ISGID */
		if (S_ISDIR(mode))
			mode |= S_ISGID;
		/*
		 * A group-executable set-group-ID non-directory loses S_ISGID
		 * unless the caller is in the inherited group or has
		 * CAP_FSETID over @dir.
		 */
		else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
			 !in_group_p(inode->i_gid) &&
			 !capable_wrt_inode_uidgid(dir, CAP_FSETID))
			mode &= ~S_ISGID;
	} else
		inode->i_gid = current_fsgid();
	inode->i_mode = mode;
}
EXPORT_SYMBOL(inode_init_owner);

/**
 * inode_owner_or_capable - check current task permissions to inode
 * @inode: inode being checked
 *
 * Return true if current either has CAP_FOWNER in a namespace
with the
 * inode owner uid mapped, or owns the file.
 */
bool inode_owner_or_capable(const struct inode *inode)
{
	struct user_namespace *ns;

	/* Owner check first: no capability lookup needed for the owner. */
	if (uid_eq(current_fsuid(), inode->i_uid))
		return true;

	ns = current_user_ns();
	if (kuid_has_mapping(ns, inode->i_uid) && ns_capable(ns, CAP_FOWNER))
		return true;
	return false;
}
EXPORT_SYMBOL(inode_owner_or_capable);

/*
 * Direct i/o helper functions
 */
/* Sleep uninterruptibly until i_dio_count is observed to be zero. */
static void __inode_dio_wait(struct inode *inode)
{
	wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
	DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);

	do {
		/* Queue first, then re-check the count, before sleeping. */
		prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE);
		if (atomic_read(&inode->i_dio_count))
			schedule();
	} while (atomic_read(&inode->i_dio_count));
	finish_wait(wq, &q.wq_entry);
}

/**
 * inode_dio_wait - wait for outstanding DIO requests to finish
 * @inode: inode to wait for
 *
 * Waits for all pending direct I/O requests to finish so that we can
 * proceed with a truncate or equivalent operation.
 *
 * Returns immediately when i_dio_count is already zero.
 *
 * Must be called under a lock that serializes taking new references
 * to i_dio_count, usually by inode->i_mutex.
 */
void inode_dio_wait(struct inode *inode)
{
	if (atomic_read(&inode->i_dio_count))
		__inode_dio_wait(inode);
}
EXPORT_SYMBOL(inode_dio_wait);

/*
 * inode_set_flags - atomically set some inode flags
 *
 * Note: the caller should be holding i_mutex, or else be sure that
 * they have exclusive access to the inode structure (i.e., while the
 * inode is being instantiated). The reason for the cmpxchg() loop
 * --- which wouldn't be necessary if all code paths which modify
 * i_flags actually followed this rule, is that there is at least one
 * code path which doesn't today so we use cmpxchg() out of an abundance
 * of caution.
 *
 * In the long run, i_mutex is overkill, and we should probably look
 * at using the i_lock spinlock to protect i_flags, and then make sure
 * it is so documented in include/linux/fs.h and that all code follows
 * the locking convention!!
*/ void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask) { WARN_ON_ONCE(flags & ~mask); set_mask_bits(&inode->i_flags, mask, flags); } EXPORT_SYMBOL(inode_set_flags); void inode_nohighmem(struct inode *inode) { mapping_set_gfp_mask(inode->i_mapping, GFP_USER); } EXPORT_SYMBOL(inode_nohighmem); /** * timestamp_truncate - Truncate timespec to a granularity * @t: Timespec * @inode: inode being updated * * Truncate a timespec to the granularity supported by the fs * containing the inode. Always rounds down. gran must * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). */ struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode) { struct super_block *sb = inode->i_sb; unsigned int gran = sb->s_time_gran; t.tv_sec = clamp(t.tv_sec, sb->s_time_min, sb->s_time_max); if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min)) t.tv_nsec = 0; /* Avoid division in the common cases 1 ns and 1 s. */ if (gran == 1) ; /* nothing */ else if (gran == NSEC_PER_SEC) t.tv_nsec = 0; else if (gran > 1 && gran < NSEC_PER_SEC) t.tv_nsec -= t.tv_nsec % gran; else WARN(1, "invalid file time granularity: %u", gran); return t; } EXPORT_SYMBOL(timestamp_truncate); /** * current_time - Return FS time * @inode: inode. * * Return the current time truncated to the time granularity supported by * the fs. * * Note that inode and inode->sb cannot be NULL. * Otherwise, the function warns and returns time without truncation. */ struct timespec64 current_time(struct inode *inode) { struct timespec64 now; ktime_get_coarse_real_ts64(&now); if (unlikely(!inode->i_sb)) { WARN(1, "current_time() called with uninitialized super_block in the inode"); return now; } return timestamp_truncate(now, inode); } EXPORT_SYMBOL(current_time); /* * Generic function to check FS_IOC_SETFLAGS values and reject any invalid * configurations. 
* * Note: the caller should be holding i_mutex, or else be sure that they have * exclusive access to the inode structure. */ int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags, unsigned int flags) { /* * The IMMUTABLE and APPEND_ONLY flags can only be changed by * the relevant capability. * * This test looks nicer. Thanks to Pauline Middelink */ if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) && !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; return fscrypt_prepare_setflags(inode, oldflags, flags); } EXPORT_SYMBOL(vfs_ioc_setflags_prepare); /* * Generic function to check FS_IOC_FSSETXATTR values and reject any invalid * configurations. * * Note: the caller should be holding i_mutex, or else be sure that they have * exclusive access to the inode structure. */ int vfs_ioc_fssetxattr_check(struct inode *inode, const struct fsxattr *old_fa, struct fsxattr *fa) { /* * Can't modify an immutable/append-only file unless we have * appropriate permission. */ if ((old_fa->fsx_xflags ^ fa->fsx_xflags) & (FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND) && !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; /* * Project Quota ID state is only allowed to change from within the init * namespace. Enforce that restriction only if we are trying to change * the quota ID state. Everything else is allowed in user namespaces. */ if (current_user_ns() != &init_user_ns) { if (old_fa->fsx_projid != fa->fsx_projid) return -EINVAL; if ((old_fa->fsx_xflags ^ fa->fsx_xflags) & FS_XFLAG_PROJINHERIT) return -EINVAL; } /* Check extent size hints. */ if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode)) return -EINVAL; if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) && !S_ISDIR(inode->i_mode)) return -EINVAL; if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) && !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) return -EINVAL; /* * It is only valid to set the DAX flag on regular files and * directories on filesystems. 
*/ if ((fa->fsx_xflags & FS_XFLAG_DAX) && !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) return -EINVAL; /* Extent size hints of zero turn off the flags. */ if (fa->fsx_extsize == 0) fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT); if (fa->fsx_cowextsize == 0) fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; return 0; } EXPORT_SYMBOL(vfs_ioc_fssetxattr_check);
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Credentials management - see Documentation/security/credentials.rst * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 
* Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_CRED_H #define _LINUX_CRED_H #include <linux/capability.h> #include <linux/init.h> #include <linux/key.h> #include <linux/atomic.h> #include <linux/uidgid.h> #include <linux/sched.h> #include <linux/sched/user.h> struct cred; struct inode; /* * COW Supplementary groups list */ struct group_info { atomic_t usage; int ngroups; kgid_t gid[0]; } __randomize_layout; /** * get_group_info - Get a reference to a group info structure * @group_info: The group info to reference * * This gets a reference to a set of supplementary groups. * * If the caller is accessing a task's credentials, they must hold the RCU read * lock when reading. */ static inline struct group_info *get_group_info(struct group_info *gi) { atomic_inc(&gi->usage); return gi; } /** * put_group_info - Release a reference to a group info structure * @group_info: The group info to release */ #define put_group_info(group_info) \ do { \ if (atomic_dec_and_test(&(group_info)->usage)) \ groups_free(group_info); \ } while (0) extern struct group_info init_groups; #ifdef CONFIG_MULTIUSER extern struct group_info *groups_alloc(int); extern void groups_free(struct group_info *); extern int in_group_p(kgid_t); extern int in_egroup_p(kgid_t); extern int groups_search(const struct group_info *, kgid_t); extern int set_current_groups(struct group_info *); extern void set_groups(struct cred *, struct group_info *); extern bool may_setgroups(void); extern void groups_sort(struct group_info *); #else static inline void groups_free(struct group_info *group_info) { } static inline int in_group_p(kgid_t grp) { return 1; } static inline int in_egroup_p(kgid_t grp) { return 1; } static inline int groups_search(const struct group_info *group_info, kgid_t grp) { return 1; } #endif /* * The security context of a task * * The parts of the context break down into two categories: * * (1) The objective context of a task. 
These parts are used when some other * task is attempting to affect this one. * * (2) The subjective context. These details are used when the task is acting * upon another object, be that a file, a task, a key or whatever. * * Note that some members of this structure belong to both categories - the * LSM security pointer for instance. * * A task has two security pointers. task->real_cred points to the objective * context that defines that task's actual details. The objective part of this * context is used whenever that task is acted upon. * * task->cred points to the subjective context that defines the details of how * that task is going to act upon another object. This may be overridden * temporarily to point to another security context, but normally points to the * same context as task->real_cred. */ struct cred { atomic_t usage; #ifdef CONFIG_DEBUG_CREDENTIALS atomic_t subscribers; /* number of processes subscribed */ void *put_addr; unsigned magic; #define CRED_MAGIC 0x43736564 #define CRED_MAGIC_DEAD 0x44656144 #endif kuid_t uid; /* real UID of the task */ kgid_t gid; /* real GID of the task */ kuid_t suid; /* saved UID of the task */ kgid_t sgid; /* saved GID of the task */ kuid_t euid; /* effective UID of the task */ kgid_t egid; /* effective GID of the task */ kuid_t fsuid; /* UID for VFS ops */ kgid_t fsgid; /* GID for VFS ops */ unsigned securebits; /* SUID-less security management */ kernel_cap_t cap_inheritable; /* caps our children can inherit */ kernel_cap_t cap_permitted; /* caps we're permitted */ kernel_cap_t cap_effective; /* caps we can actually use */ kernel_cap_t cap_bset; /* capability bounding set */ kernel_cap_t cap_ambient; /* Ambient capability set */ #ifdef CONFIG_KEYS unsigned char jit_keyring; /* default keyring to attach requested * keys to */ struct key *session_keyring; /* keyring inherited over fork */ struct key *process_keyring; /* keyring private to this process */ struct key *thread_keyring; /* keyring private to this thread */ 
struct key *request_key_auth; /* assumed request_key authority */ #endif #ifdef CONFIG_SECURITY void *security; /* subjective LSM security */ #endif struct user_struct *user; /* real user ID subscription */ struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */ struct group_info *group_info; /* supplementary groups for euid/fsgid */ /* RCU deletion */ union { int non_rcu; /* Can we skip RCU deletion? */ struct rcu_head rcu; /* RCU deletion hook */ }; } __randomize_layout; extern void __put_cred(struct cred *); extern void exit_creds(struct task_struct *); extern int copy_creds(struct task_struct *, unsigned long); extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); extern struct cred *prepare_exec_creds(void); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); extern const struct cred *override_creds(const struct cred *); extern void revert_creds(const struct cred *); extern struct cred *prepare_kernel_cred(struct task_struct *); extern int change_create_files_as(stru